You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/18 08:11:30 UTC

svn commit: r1533329 [3/3] - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: resources/org/apache/lucene/analysis/ko/dic/ tools/java/org/apache/lucene/analysis/ko/

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java?rev=1533329&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java Fri Oct 18 06:11:29 2013
@@ -0,0 +1,240 @@
+package org.apache.lucene.analysis.ko;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.net.URL;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.zip.ZipFile;
+
+/**
+ * Generates a mapping from single hanja to a set of possible hangul pronunciations.
+ * <p>
+ * This is used by KoreanFilter.analysisChinese() to perform substitutions and look
+ * for dictionary entries.
+ * <p>
+ * The mappings are merged from the sources at {@link #IME_URL}, {@link #UNIHAN_URL}
+ * and {@link #OO_URL} and written, below the contents of hanjamap.license.txt, as one
+ * line per hanja: the hanja, a comma, then all of its hangul readings.
+ * <p>
+ * Usage: <code>GenerateHanjaMap [outputFile]</code>. Without an argument the result
+ * is written to {@link #output}.
+ */
+public class GenerateHanjaMap {
+  
+  // default destination, used when no output file is given on the command line
+  static final File output = new File("/home/rmuir/workspace/lucene-clean-trunk/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/mapHanja.dic");
+  private static final String NL = System.getProperty("line.separator");
+
+  /**
+   * Downloads and merges all sources, prints statistics and writes the dictionary.
+   *
+   * @param args optionally, {@code args[0]} names the file to write instead of {@link #output}
+   * @throws Exception if a source cannot be read or parsed, or the output cannot be written
+   */
+  public static void main(String args[]) throws Exception {
+    File target = (args != null && args.length > 0) ? new File(args[0]) : output;
+    // sorted collections are inefficient but we don't care, and they keep the output ordered
+    Map<Character,Set<Character>> mappings = new TreeMap<>();
+    addIMEMappings(mappings);
+    addUnihanMappings(mappings);
+    addOOMappings(mappings);
+    // print statistics
+    System.out.println("# hanja keys: " + mappings.size());
+    int kvpairs = 0;
+    for (Set<Character> hangul : mappings.values()) {
+      kvpairs += hangul.size();
+    }
+    System.out.println("# hanja/hangul mappings: " + kvpairs);
+    
+    // the license is opened first so that a missing resource fails before the target is truncated
+    try (BufferedReader licenseFile = openLicense();
+         BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target), "UTF-8"))) {
+      // write license
+      String line;
+      while ((line = licenseFile.readLine()) != null) {
+        writer.write(line);
+        writer.write(NL);
+      }
+      // write out the mappings
+      for (Map.Entry<Character,Set<Character>> entry : mappings.entrySet()) {
+        writer.write(entry.getKey().charValue());
+        writer.write(',');
+        for (Character v : entry.getValue()) {
+          writer.write(v.charValue());
+        }
+        writer.write(NL);
+      }
+    }
+  }
+  
+  /** Opens hanjamap.license.txt, which must be on the classpath next to this class. */
+  private static BufferedReader openLicense() throws Exception {
+    java.io.InputStream in = GenerateHanjaMap.class.getResourceAsStream("hanjamap.license.txt");
+    if (in == null) {
+      throw new IllegalStateException("hanjamap.license.txt not found next to GenerateHanjaMap on the classpath");
+    }
+    return new BufferedReader(new InputStreamReader(in, "UTF-8"));
+  }
+  
+  /** Opens the resource at the given URL as UTF-8 text. */
+  private static BufferedReader openUrl(String url) throws Exception {
+    return new BufferedReader(new InputStreamReader(new URL(url).openStream(), "UTF-8"));
+  }
+  
+  static final String IME_URL = "http://google-input-tools.googlecode.com/git/src/chrome/os/nacl-hangul/misc/hanja.txt";
+
+  /**
+   * Adds the single-hanja entries of the table at {@link #IME_URL}.
+   * <p>
+   * Lines are colon-separated with the hangul in the first field and the hanja in the
+   * second. Empty lines, lines starting with '#' and entries whose hanja field is not
+   * exactly one char are skipped.
+   *
+   * @throws RuntimeException if a line has no second field, or a single hanja is not
+   *         paired with a single hangul char
+   */
+  static void addIMEMappings(Map<Character,Set<Character>> mappings) throws Exception {
+    try (BufferedReader r = openUrl(IME_URL)) {
+      String line;
+      while ((line = r.readLine()) != null) {
+        if (line.isEmpty() || line.startsWith("#")) {
+          continue;
+        }
+        String[] tokens = line.split(":");
+        if (tokens.length < 2) {
+          throw new RuntimeException("malformed line in " + IME_URL + ": " + line);
+        }
+        if (tokens[1].length() != 1) {
+          continue; // not a single hanja
+        }
+        if (tokens[0].length() != 1) {
+          throw new RuntimeException("expected a single hangul char in " + IME_URL + ": " + line);
+        }
+        add(mappings, tokens[1].charAt(0), tokens[0].charAt(0));
+      }
+    }
+  }
+  
+  static final String OO_URL = "http://svn.apache.org/repos/asf/openoffice/trunk/main/i18npool/source/textconversion/data/hhc_char.dic";
+
+  /**
+   * Adds the entries of the table at {@link #OO_URL}.
+   * <p>
+   * Every line must have the form <code>hangul:hanja...</code>: one hangul char, a colon,
+   * then the hanja that can be read as that hangul.
+   *
+   * @throws RuntimeException if a line does not have that form
+   */
+  static void addOOMappings(Map<Character,Set<Character>> mappings) throws Exception {
+    try (BufferedReader r = openUrl(OO_URL)) {
+      String line;
+      while ((line = r.readLine()) != null) {
+        String[] fields = line.split(":");
+        if (fields.length != 2 || fields[0].length() != 1) {
+          throw new RuntimeException("malformed line in " + OO_URL + ": " + line);
+        }
+        char v = fields[0].charAt(0);
+        for (int i = 0; i < fields[1].length(); i++) {
+          add(mappings, fields[1].charAt(i), v);
+        }
+      }
+    }
+  }
+  
+  static final String UNIHAN_URL = "http://www.unicode.org/Public/6.3.0/ucd/Unihan.zip";
+
+  /**
+   * Adds the kHangul readings from Unihan_Readings.txt inside the archive at {@link #UNIHAN_URL}.
+   * <p>
+   * The archive is downloaded to a temporary file, which is deleted again even if parsing fails.
+   *
+   * @throws RuntimeException if the archive or a kHangul line is malformed, a hanja lies
+   *         outside the BMP, or a reading is not a single char
+   */
+  static void addUnihanMappings(Map<Character,Set<Character>> mappings) throws Exception {
+    File tmp = File.createTempFile("unihan", ".zip");
+    try {
+      try (ReadableByteChannel in = Channels.newChannel(new URL(UNIHAN_URL).openStream());
+           FileOutputStream out = new FileOutputStream(tmp)) {
+        out.getChannel().transferFrom(in, 0, Long.MAX_VALUE);
+      }
+      try (ZipFile zip = new ZipFile(tmp)) {
+        java.util.zip.ZipEntry entry = zip.getEntry("Unihan_Readings.txt");
+        if (entry == null) {
+          throw new RuntimeException("Unihan_Readings.txt is missing from " + UNIHAN_URL);
+        }
+        try (BufferedReader r = new BufferedReader(new InputStreamReader(zip.getInputStream(entry), "UTF-8"))) {
+          String line;
+          while ((line = r.readLine()) != null) {
+            if (line.isEmpty() || line.startsWith("#")) {
+              continue;
+            }
+            addUnihanLine(mappings, line);
+          }
+        }
+      }
+    } finally {
+      tmp.delete();
+    }
+  }
+  
+  /** Parses one tab-separated data line (code point, field name, value); only kHangul lines add mappings. */
+  private static void addUnihanLine(Map<Character,Set<Character>> mappings, String line) {
+    String[] fields = line.split("\t");
+    if (fields.length < 2) {
+      throw new RuntimeException("malformed line in Unihan_Readings.txt: " + line);
+    }
+    if (!fields[1].equals("kHangul")) {
+      return;
+    }
+    if (fields.length < 3) {
+      throw new RuntimeException("kHangul line without readings: " + line);
+    }
+    int codepoint = Integer.parseInt(fields[0].substring(2), 16); // strip the "U+" prefix
+    if (codepoint > 0xFFFF) {
+      throw new RuntimeException("hanja outside the BMP is not supported: " + line);
+    }
+    for (String reading : fields[2].split("\\s+")) {
+      if (reading.length() != 1) {
+        throw new RuntimeException("expected single-char readings: " + line);
+      }
+      add(mappings, (char) codepoint, reading.charAt(0));
+    }
+  }
+  
+  /** Records {@code v} as a hangul reading of the hanja {@code k}, creating the (sorted) set on first use. */
+  static void add(Map<Character,Set<Character>> mappings, char k, char v) {
+    Set<Character> current = mappings.get(k);
+    if (current == null) {
+      current = new TreeSet<>();
+      mappings.put(k, current);
+    }
+    current.add(v);
+  }
+}

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt?rev=1533329&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt Fri Oct 18 06:11:29 2013
@@ -0,0 +1,76 @@
+! Some mappings (single hanja mappings only) from 
+! https://code.google.com/p/google-input-tools/source/browse/src/chrome/os/nacl-hangul/misc/hanja.txt
+! Original license:
+!
+! Copyright (c) 2005,2006 Choe Hwanjin
+! All rights reserved.
+! 
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions are met:
+! 
+! 1. Redistributions of source code must retain the above copyright notice,
+!    this list of conditions and the following disclaimer.
+! 2. Redistributions in binary form must reproduce the above copyright notice,
+!    this list of conditions and the following disclaimer in the documentation
+!    and/or other materials provided with the distribution.
+! 3. Neither the name of the author nor the names of its contributors
+!    may be used to endorse or promote products derived from this software
+!    without specific prior written permission.
+! 
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+! ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+! 
+! Additional mappings (kHangul field in Unihan_Readings.txt) from:
+! http://www.unicode.org/Public/6.3.0/ucd/Unihan.zip
+! Original license:
+! 
+! NOTICE TO USER: Carefully read the following legal agreement. BY DOWNLOADING, 
+! INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES ("DATA FILES"), 
+! AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, 
+! ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT 
+! DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+!
+! COPYRIGHT AND PERMISSION NOTICE
+!
+! Copyright (c) 1991-2013 Unicode, Inc. 
+! All rights reserved. 
+! Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+!
+! Permission is hereby granted, free of charge, to any person obtaining a copy 
+! of the Unicode data files and any associated documentation (the "Data Files") 
+! or Unicode software and any associated documentation (the "Software") to deal 
+! in the Data Files or Software without restriction, including without limitation 
+! the rights to use, copy, modify, merge, publish, distribute, and/or sell copies 
+! of the Data Files or Software, and to permit persons to whom the Data Files or 
+! Software are furnished to do so, provided that (a) the above copyright notice(s) 
+! and this permission notice appear with all copies of the Data Files or Software, 
+! (b) both the above copyright notice(s) and this permission notice appear in 
+! associated documentation, and (c) there is clear notice in each modified Data 
+! File or in the Software as well as in the documentation associated with the Data 
+! File(s) or Software that the data or software has been modified.
+!
+! THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+! EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO 
+! EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR 
+! ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES 
+! WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF 
+! CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 
+! WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+! 
+! Except as contained in this notice, the name of a copyright holder shall not be 
+! used in advertising or otherwise to promote the sale, use or other dealings in 
+! these Data Files or Software without prior written authorization of the copyright holder.
+!
+! Additional mappings from:
+! http://svn.apache.org/repos/asf/openoffice/trunk/main/i18npool/source/textconversion/data/hhc_char.dic
+! (Apache 2.0 License)