You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/18 08:11:30 UTC
svn commit: r1533329 [3/3] - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
resources/org/apache/lucene/analysis/ko/dic/
tools/java/org/apache/lucene/analysis/ko/
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java?rev=1533329&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/GenerateHanjaMap.java Fri Oct 18 06:11:29 2013
@@ -0,0 +1,166 @@
+package org.apache.lucene.analysis.ko;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.net.URL;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.zip.ZipFile;
+
+/**
+ * Generates a mapping from single hanja to a set of possible hangul pronunciations.
+ * <p>
+ * This is used by KoreanFilter.analysisChinese() to perform substitutions and look
+ * for dictionary entries.
+ */
+public class GenerateHanjaMap {
+
+ // change this to where you want the stuff to go
+ static final File output = new File("/home/rmuir/workspace/lucene-clean-trunk/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/mapHanja.dic");
+ private static final String NL = System.getProperty("line.separator");
+
+ public static void main(String args[]) throws Exception {
+ // inefficient but we dont care
+ Map<Character,Set<Character>> mappings = new TreeMap<>();
+ addIMEMappings(mappings);
+ addUnihanMappings(mappings);
+ addOOMappings(mappings);
+ // print statistics
+ System.out.println("# hanja keys: " + mappings.size());
+ int kvpairs = 0;
+ for (Set<Character> hangul : mappings.values()) {
+ kvpairs += hangul.size();
+ }
+ System.out.println("# hanja/hangul mappings: " + kvpairs);
+
+ // write license
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"));
+ BufferedReader licenseFile = new BufferedReader(new InputStreamReader(GenerateHanjaMap.class.getResourceAsStream("hanjamap.license.txt"), "UTF-8"));
+ String line = null;
+ while ((line = licenseFile.readLine()) != null) {
+ writer.write(line);
+ writer.write(NL);
+ }
+ licenseFile.close();
+
+ // write out the mappings
+ for (Character k : mappings.keySet()) {
+ writer.write(k);
+ writer.write(',');
+ for (Character v : mappings.get(k)) {
+ writer.write(v);
+ }
+ writer.write(NL);
+ }
+ writer.close();
+ }
+
+ static String IME_URL = "http://google-input-tools.googlecode.com/git/src/chrome/os/nacl-hangul/misc/hanja.txt";
+ static void addIMEMappings(Map<Character,Set<Character>> mappings) throws Exception {
+ BufferedReader r = new BufferedReader(new InputStreamReader(new URL(IME_URL).openStream(), "UTF-8"));
+ String line = null;
+ while ((line = r.readLine()) != null) {
+ if (!line.startsWith("#") && line.length() > 0) {
+ String tokens[] = line.split(":");
+ if (tokens[1].length() == 1) {
+ char k = tokens[1].charAt(0);
+ if (tokens[0].length() != 1) {
+ throw new RuntimeException();
+ }
+ char v = tokens[0].charAt(0);
+ add(mappings, k, v);
+ }
+ }
+ }
+ r.close();
+ }
+
+ static String OO_URL = "http://svn.apache.org/repos/asf/openoffice/trunk/main/i18npool/source/textconversion/data/hhc_char.dic";
+ static void addOOMappings(Map<Character,Set<Character>> mappings) throws Exception {
+ BufferedReader r = new BufferedReader(new InputStreamReader(new URL(OO_URL).openStream(), "UTF-8"));
+ String line = null;
+ while ((line = r.readLine()) != null) {
+ String fields[] = line.split(":");
+ if (fields.length != 2) {
+ throw new RuntimeException();
+ }
+ if (fields[0].length() != 1) {
+ throw new RuntimeException();
+ }
+ char v = fields[0].charAt(0);
+ for (int i = 0; i < fields[1].length(); i++) {
+ add(mappings, fields[1].charAt(i), v);
+ }
+ }
+ r.close();
+ }
+
+ static String UNIHAN_URL = "http://www.unicode.org/Public/6.3.0/ucd/Unihan.zip";
+ static void addUnihanMappings(Map<Character,Set<Character>> mappings) throws Exception {
+ URL url = new URL(UNIHAN_URL);
+ ReadableByteChannel in = Channels.newChannel(url.openStream());
+ File tmp = File.createTempFile("unihan", "zip");
+ FileOutputStream out = new FileOutputStream(tmp);
+ out.getChannel().transferFrom(in, 0, Long.MAX_VALUE);
+ out.close();
+ in.close();
+ ZipFile zip = new ZipFile(tmp);
+ BufferedReader r = new BufferedReader(new InputStreamReader(zip.getInputStream(zip.getEntry("Unihan_Readings.txt")), "UTF-8"));
+ String line = null;
+ while ((line = r.readLine()) != null) {
+ if (!line.startsWith("#") && line.length() > 0) {
+ String fields[] = line.split("\t");
+ if (fields[1].equals("kHangul")) {
+ int codepoint = Integer.parseInt(fields[0].substring(2), 16);
+ if (codepoint > 0xFFFF) {
+ throw new RuntimeException();
+ }
+ String readings[] = fields[2].split("\\s+");
+ for (String reading : readings) {
+ if (reading.length() != 1) {
+ throw new RuntimeException();
+ }
+ add(mappings, (char)codepoint, reading.charAt(0));
+ }
+ }
+ }
+ }
+ r.close();
+ zip.close();
+ tmp.delete();
+ }
+
+ static void add(Map<Character,Set<Character>> mappings, char k, char v) {
+ Set<Character> current = mappings.get(k);
+ if (current == null) {
+ current = new TreeSet<Character>();
+ mappings.put(k, current);
+ }
+ current.add(v);
+ }
+}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt?rev=1533329&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/hanjamap.license.txt Fri Oct 18 06:11:29 2013
@@ -0,0 +1,76 @@
+! Some mappings (single hanja mappings only) from
+! https://code.google.com/p/google-input-tools/source/browse/src/chrome/os/nacl-hangul/misc/hanja.txt
+! Original license:
+!
+! Copyright (c) 2005,2006 Choe Hwanjin
+! All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions are met:
+!
+! 1. Redistributions of source code must retain the above copyright notice,
+! this list of conditions and the following disclaimer.
+! 2. Redistributions in binary form must reproduce the above copyright notice,
+! this list of conditions and the following disclaimer in the documentation
+! and/or other materials provided with the distribution.
+! 3. Neither the name of the author nor the names of its contributors
+! may be used to endorse or promote products derived from this software
+! without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+! ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+!
+! Additional mappings (kHangul field in Unihan_Readings.txt) from:
+! http://www.unicode.org/Public/6.3.0/ucd/Unihan.zip
+! Original license:
+!
+! NOTICE TO USER: Carefully read the following legal agreement. BY DOWNLOADING,
+! INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES ("DATA FILES"),
+! AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY,
+! ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+! DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+!
+! COPYRIGHT AND PERMISSION NOTICE
+!
+! Copyright (c) 1991-2013 Unicode, Inc.
+! All rights reserved.
+! Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+!
+! Permission is hereby granted, free of charge, to any person obtaining a copy
+! of the Unicode data files and any associated documentation (the "Data Files")
+! or Unicode software and any associated documentation (the "Software") to deal
+! in the Data Files or Software without restriction, including without limitation
+! the rights to use, copy, modify, merge, publish, distribute, and/or sell copies
+! of the Data Files or Software, and to permit persons to whom the Data Files or
+! Software are furnished to do so, provided that (a) the above copyright notice(s)
+! and this permission notice appear with all copies of the Data Files or Software,
+! (b) both the above copyright notice(s) and this permission notice appear in
+! associated documentation, and (c) there is clear notice in each modified Data
+! File or in the Software as well as in the documentation associated with the Data
+! File(s) or Software that the data or software has been modified.
+!
+! THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+! EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO
+! EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR
+! ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+! WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+! CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+! WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+!
+! Except as contained in this notice, the name of a copyright holder shall not be
+! used in advertising or otherwise to promote the sale, use or other dealings in
+! these Data Files or Software without prior written authorization of the copyright holder.
+!
+! Additional mappings from:
+! http://svn.apache.org/repos/asf/openoffice/trunk/main/i18npool/source/textconversion/data/hhc_char.dic
+! (Apache 2.0 License)