You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/19 18:18:54 UTC
svn commit: r1533782 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang: ./
src/java/org/apache/lucene/analysis/ko/dic/
src/java/org/apache/lucene/analysis/ko/utils/
src/resources/org/apache/lucene/analysis/ko/dic/
src/test/org/apache/lucene/...
Author: rmuir
Date: Sat Oct 19 16:18:54 2013
New Revision: 1533782
URL: http://svn.apache.org/r1533782
Log:
LUCENE-4965: don't expose this as a large hashmap with thousands of arrays
Added:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.dat (with props)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.idx (with props)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java (with props)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
- copied, changed from r1533695, lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java
Removed:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/mapHanja.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml?rev=1533782&r1=1533781&r2=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml Sat Oct 19 16:18:54 2013
@@ -35,10 +35,11 @@
<!-- for rebuilding dictionary -->
<path id="tools.classpath">
<path refid="classpath"/>
+ <pathelement location="${build.dir}/classes/java"/>
<pathelement location="${build.dir}/classes/tools"/>
</path>
- <target name="compile-tools" depends="common.compile-tools">
+ <target name="compile-tools" depends="common.compile-tools,compile-core">
<compile
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
@@ -56,7 +57,7 @@
<delete verbose="true">
<fileset dir="${dict.target.dir}" includes="**/*"/>
</delete>
- <java fork="true" failonerror="true" maxmemory="256M" classname="org.apache.lucene.analysis.ko.DictionaryBuilder">
+ <java fork="true" failonerror="true" maxmemory="256M" classname="org.apache.lucene.analysis.ko.dic.DictionaryBuilder">
<classpath>
<path refid="tools.classpath"/>
</classpath>
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533782&r1=1533781&r2=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Sat Oct 19 16:18:54 2013
@@ -49,8 +49,9 @@ public class DictionaryResources {
public static final String FILE_UNCOMPOUNDS = "uncompounds.dic";
- public static final String FILE_MAP_HANJA_DIC = "mapHanja.dic";
-
+ public static final String FILE_HANJA_IDX = "hanja.idx";
+ public static final String FILE_HANJA_DAT = "hanja.dat";
+ public static final int DATA_VERSION = 0;
private DictionaryResources() {}
@@ -86,5 +87,4 @@ public class DictionaryResources {
processor.processLine(line);
}
}
-
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java?rev=1533782&r1=1533781&r2=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java Sat Oct 19 16:18:54 2013
@@ -18,44 +18,60 @@ package org.apache.lucene.analysis.ko.ut
*/
import java.io.IOException;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
+import java.io.InputStream;
import org.apache.lucene.analysis.ko.dic.DictionaryResources;
-import org.apache.lucene.analysis.ko.dic.LineProcessor;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
public class HanjaUtils {
private HanjaUtils() {}
- private static final Map<Character, char[]> mapHanja;
+ private static final int HANJA_START = 0x3400;
+ private static final MonotonicBlockPackedReader index;
+ private static final char[] data;
static {
+ InputStream datStream = null, idxStream = null;
try {
- final Map<Character, char[]> map = new HashMap<Character, char[]>();
- DictionaryResources.readLines(DictionaryResources.FILE_MAP_HANJA_DIC, new LineProcessor() {
- @Override
- public void processLine(String s) throws IOException {
- String[] hanInfos = s.split("[,]+");
- if(hanInfos.length!=2 || hanInfos[0].length()!=1)
- throw new IOException("Invalid file format: "+s);
-
- map.put(hanInfos[0].charAt(0), hanInfos[1].toCharArray());
- }
- });
- mapHanja = Collections.unmodifiableMap(map);
+ datStream = DictionaryResources.class.getResourceAsStream(DictionaryResources.FILE_HANJA_DAT);
+ idxStream = DictionaryResources.class.getResourceAsStream(DictionaryResources.FILE_HANJA_IDX);
+ DataInput dat = new InputStreamDataInput(datStream);
+ DataInput idx = new InputStreamDataInput(idxStream);
+ CodecUtil.checkHeader(dat, DictionaryResources.FILE_HANJA_DAT, DictionaryResources.DATA_VERSION, DictionaryResources.DATA_VERSION);
+ CodecUtil.checkHeader(idx, DictionaryResources.FILE_HANJA_IDX, DictionaryResources.DATA_VERSION, DictionaryResources.DATA_VERSION);
+ data = new char[dat.readVInt()];
+ for (int i = 0; i < data.length; i++) {
+ data[i] = (char) dat.readShort();
+ assert Character.UnicodeBlock.of(data[i]) == Character.UnicodeBlock.HANGUL_SYLLABLES;
+ }
+ index = new MonotonicBlockPackedReader(idx, idx.readVInt(), idx.readVInt(), idx.readVInt(), false);
} catch (IOException ioe) {
- throw new Error("Cannot load: " + DictionaryResources.FILE_MAP_HANJA_DIC, ioe);
+ throw new Error(ioe);
+ } finally {
+ IOUtils.closeWhileHandlingException(datStream, idxStream);
}
}
- /**
- * 한자에 대응하는 한글을 찾아서 반환한다.
- * 하나의 한자는 여러 음으로 읽일 수 있으므로 가능한 모든 음을 한글로 반환한다.
- */
+ /**
+ * Returns array of hangul pronunciations.
+ * TODO: expose this in another way */
public static char[] convertToHangul(char hanja) {
-// if(hanja>0x9FFF||hanja<0x3400) return new char[]{hanja};
-
- final char[] result = mapHanja.get(hanja);
- return (result==null) ? new char[]{hanja} : result;
+ if (hanja < HANJA_START) {
+ return new char[] { hanja };
+ } else {
+ int idx = hanja - HANJA_START;
+ int start = (int) index.get(idx);
+ int end = (int) index.get(idx+1);
+ if (end - start == 0) {
+ return new char[] { hanja };
+ } else {
+ char result[] = new char[end - start];
+ System.arraycopy(data, start, result, 0, end - start);
+ return result;
+ }
+ }
}
}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.dat?rev=1533782&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.idx
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.idx?rev=1533782&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java?rev=1533782&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java Sat Oct 19 16:18:54 2013
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.ko.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestHanjaUtils extends LuceneTestCase {
+
+ public void testOneToOne() {
+ assertEquals("구", new String(HanjaUtils.convertToHangul('㐀')));
+ assertEquals("인", new String(HanjaUtils.convertToHangul('印')));
+ }
+
+ public void testOneToMany() {
+ assertEquals("기지", new String(HanjaUtils.convertToHangul('枳')));
+ }
+
+ public void testOutOfBounds() {
+ assertEquals("\u33FF", new String(HanjaUtils.convertToHangul('\u33FF')));
+ assertEquals("A", new String(HanjaUtils.convertToHangul('A')));
+ assertEquals("\uFF09", new String(HanjaUtils.convertToHangul('\uFF09')));
+ }
+
+ public void testEitherHangulOrItselfBack() {
+ for (int i = 0; i <= 0xFFFF; i++) {
+ char res[] = HanjaUtils.convertToHangul((char)i);
+ if (res.length == 1 && res[0] == i) {
+ continue;
+ } else {
+ assert res.length > 0;
+ for (int j = 0; j < res.length; j++) {
+ assertEquals(Character.UnicodeBlock.HANGUL_SYLLABLES, Character.UnicodeBlock.of(res[j]));
+ }
+ }
+ }
+ }
+}
Copied: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (from r1533695, lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?p2=lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java&p1=lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java&r1=1533695&r2=1533782&rev=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Sat Oct 19 16:18:54 2013
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.ko;
+package org.apache.lucene.analysis.ko.dic;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -19,16 +19,35 @@ package org.apache.lucene.analysis.ko;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
public class DictionaryBuilder {
public static void main(String args[]) throws Exception {
- String FILES_AS_IS[] = { "compounds.dic", "dictionary.dic", "eomi.dic", "extension.dic", "josa.dic", "mapHanja.dic",
- "prefix.dic", "suffix.dic", "syllable.dic", "uncompounds.dic" };
+ String FILES_AS_IS[] = {
+ DictionaryResources.FILE_COMPOUNDS,
+ DictionaryResources.FILE_DICTIONARY,
+ DictionaryResources.FILE_EOMI,
+ DictionaryResources.FILE_EXTENSION,
+ DictionaryResources.FILE_JOSA,
+ DictionaryResources.FILE_PREFIX,
+ DictionaryResources.FILE_SUFFIX,
+ DictionaryResources.FILE_SYLLABLE_FEATURE,
+ DictionaryResources.FILE_UNCOMPOUNDS
+ };
+
File inputDir = new File(args[0]);
File outputDir = new File(args[1]);
for (String file : FILES_AS_IS) {
@@ -36,6 +55,7 @@ public class DictionaryBuilder {
File out = new File(outputDir, file);
copyAsIs(in, out);
}
+ buildHanjaMap(inputDir, outputDir);
}
static void copyAsIs(File in, File out) throws Exception {
@@ -48,4 +68,56 @@ public class DictionaryBuilder {
r.close();
w.close();
}
+
+ static void buildHanjaMap(File inputDir, File outputDir) throws Exception {
+ final int HANJA_START = 0x3400;
+ final int IDX_SIZE = 0x10000 - HANJA_START;
+ OutputStream idxStream = new BufferedOutputStream(new FileOutputStream(new File(outputDir, DictionaryResources.FILE_HANJA_IDX)));
+ DataOutput idx = new OutputStreamDataOutput(idxStream);
+ CodecUtil.writeHeader(idx, DictionaryResources.FILE_HANJA_IDX, DictionaryResources.DATA_VERSION);
+ idx.writeVInt(PackedInts.VERSION_CURRENT);
+ idx.writeVInt(1024);
+ idx.writeVInt(IDX_SIZE+1); // CJK: first half of unicode, compat: at the end. but monotonic's blocking works here (?)
+ MonotonicBlockPackedWriter idxArray = new MonotonicBlockPackedWriter(idx, 1024);
+
+ OutputStream datStream = new BufferedOutputStream(new FileOutputStream(new File(outputDir, DictionaryResources.FILE_HANJA_DAT)));
+ DataOutput dat = new OutputStreamDataOutput(datStream);
+ CodecUtil.writeHeader(dat, DictionaryResources.FILE_HANJA_DAT, DictionaryResources.DATA_VERSION);
+ char datArray[] = new char[256];
+ File input = new File(inputDir, "mapHanja.dic");
+ BufferedReader reader = new BufferedReader(IOUtils.getDecodingReader(input, IOUtils.CHARSET_UTF_8));
+ int currentInput = -1;
+ int currentOutput = 0;
+ String line = null;
+ while ((line = reader.readLine()) != null) {
+ if (line.startsWith("!")) {
+ continue;
+ }
+ int cp = line.charAt(0) - HANJA_START;
+ while (currentInput < cp) {
+ idxArray.add(currentOutput);
+ currentInput++;
+ }
+ String mappings = line.substring(2);
+ for (int i = 0; i < mappings.length(); i++) {
+ if (currentOutput == datArray.length) {
+ datArray = ArrayUtil.grow(datArray);
+ }
+ datArray[currentOutput] = mappings.charAt(i);
+ currentOutput++;
+ }
+ currentInput = cp;
+ }
+ while (currentInput < IDX_SIZE) {
+ idxArray.add(currentOutput);
+ currentInput++;
+ }
+ idxArray.finish();
+ dat.writeVInt(currentOutput);
+ for (int i = 0; i < currentOutput; i++) {
+ dat.writeShort((short) datArray[i]);
+ }
+ idxStream.close();
+ datStream.close();
+ }
}