You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/01/07 16:04:21 UTC
svn commit: r1228629 -
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/
Author: uschindler
Date: Sat Jan 7 15:04:21 2012
New Revision: 1228629
URL: http://svn.apache.org/viewvc?rev=1228629&view=rev
Log:
LUCENE-3305: Change class hierarchy in builders, too. This is now much more logical: the trie and the target maps are now built inside the corresponding TokenInfoBuilder
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (with props)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1228629&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Sat Jan 7 15:04:21 2012
@@ -0,0 +1,277 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.WritableByteChannel;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
+
+public abstract class BinaryDictionaryWriter { // collects binary entries, a target map and a POS table in memory, then writes each to its own file
+ protected final Class<? extends BinaryDictionary> implClazz; // its fully-qualified name determines the output path (see getBaseFileName)
+ protected ByteBuffer buffer; // entries encoded by put(); swapped for a larger buffer when it runs out of room
+ private int targetMapSize = 0; // largest sourceId passed to addMapping() plus one
+ private int[][] targetMap = new int[8192][]; // word ids per sourceId; a slot stays null until its first mapping
+ private int[] targetMapComponentSizes = new int[8192]; // number of used elements in targetMap[sourceId]
+ private final List<String> posDict = new ArrayList<String>(); // distinct POS strings; the list index is what put() stores in an entry
+ private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>(); // POS string -> its index in posDict
+
+ public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) { // size: initial buffer capacity in bytes
+ this.implClazz = implClazz;
+ buffer = ByteBuffer.allocate(size);
+ }
+
+ /**
+ * Encodes one entry and appends it to the buffer. Reads entry[0] (surface form, compared with the base form), entry[1..3] (left id, right id, word cost, parsed as shorts), entry[4..7] (POS fields), entry[10] (base form), entry[11] (reading) and entry[12] (pronunciation).
+ * @return the buffer position after the entry was appended; presumably used by callers as the wordId of the next entry (callers are not shown here)
+ */
+ public int put(String[] entry) {
+ short leftId = Short.parseShort(entry[1]);
+ short rightId = Short.parseShort(entry[2]);
+ short wordCost = Short.parseShort(entry[3]);
+
+ StringBuilder sb = new StringBuilder();
+
+ // build up the POS string: fields 4..7, each quote-escaped, joined with commas
+ for (int i = 4; i < 8; i++) {
+ sb.append(CSVUtil.quoteEscape(entry[i]));
+ if (i < 7) {
+ sb.append(',');
+ }
+ }
+ String pos = sb.toString();
+ Integer posIndex = posDictLookup.get(pos); // reuse the index of an already-seen POS string
+ if (posIndex == null) {
+ posIndex = posDict.size(); // first occurrence: the new index is the current list size
+ posDict.add(pos);
+ posDictLookup.put(pos, posIndex);
+ assert posDict.size() == posDictLookup.size();
+ }
+
+ // TODO: what are the parts 9 and 10 that kuromoji does not expose via Token?
+ // we need to break all these out (we can structure them inside posdict)
+
+ String baseForm = entry[10];
+ String reading = entry[11];
+ String pronunciation = entry[12];
+
+ // extend buffer if necessary
+ int left = buffer.remaining();
+ // worst case: three shorts (6 bytes), 4 single bytes (POS index plus three length/flag bytes) and all features as UTF-16 (2 bytes per char)
+ int worstCase = 6 + 4 + 2*(baseForm.length() + reading.length() + pronunciation.length());
+ if (worstCase > left) {
+ ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1)); // minimum size = bytes written so far (limit - left) + worstCase
+ buffer.flip(); // make the bytes written so far readable for the copy below
+ newBuffer.put(buffer);
+ buffer = newBuffer;
+ }
+
+ buffer.putShort(leftId);
+ buffer.putShort(rightId);
+ buffer.putShort(wordCost);
+ assert posIndex.intValue() < 256; // the POS index is stored in a single byte
+ buffer.put(posIndex.byteValue());
+
+ if (baseForm.equals(entry[0])) {
+ buffer.put((byte)0); // base form is the same as surface form
+ } else {
+ buffer.put((byte)baseForm.length()); // NOTE(review): length is narrowed to one byte with no range check in this method
+ for (int i = 0; i < baseForm.length(); i++) {
+ buffer.putChar(baseForm.charAt(i));
+ }
+ }
+
+ if (isKatakana(reading)) { // all chars in U+30A0..U+30FF: store one byte per char
+ buffer.put((byte) (reading.length() << 1 | 1)); // length shifted left by one, low bit 1 = one-byte katakana encoding follows
+ writeKatakana(reading);
+ } else {
+ buffer.put((byte) (reading.length() << 1)); // low bit 0 = two-byte chars follow
+ for (int i = 0; i < reading.length(); i++) {
+ buffer.putChar(reading.charAt(i));
+ }
+ }
+
+ if (pronunciation.equals(reading)) {
+ buffer.put((byte)0); // pronunciation is the same as reading
+ } else {
+ if (isKatakana(pronunciation)) { // same length/flag byte scheme as used for the reading above
+ buffer.put((byte) (pronunciation.length() << 1 | 1));
+ writeKatakana(pronunciation);
+ } else {
+ buffer.put((byte) (pronunciation.length() << 1));
+ for (int i = 0; i < pronunciation.length(); i++) {
+ buffer.putChar(pronunciation.charAt(i));
+ }
+ }
+ }
+
+ return buffer.position();
+ }
+
+ private boolean isKatakana(String s) { // true if every char lies in U+30A0..U+30FF; also true for the empty string
+ for (int i = 0; i < s.length(); i++) {
+ char ch = s.charAt(i);
+ if (ch < 0x30A0 || ch > 0x30FF) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void writeKatakana(String s) { // writes each char as one byte, its offset from U+30A0; both call sites check isKatakana() first
+ for (int i = 0; i < s.length(); i++) {
+ buffer.put((byte) (s.charAt(i) - 0x30A0));
+ }
+ }
+
+ public void addMapping(int sourceId, int wordId) { // appends wordId to the targets of sourceId, enlarging the arrays as needed
+ if(targetMap.length <= sourceId) { // sourceId does not fit yet: enlarge both parallel arrays to the same new size
+ final int newSize = ArrayUtil.oversize(sourceId + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
+ int[][] newArray = new int[newSize][];
+ System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
+ targetMap = newArray;
+ int[] newSizeArray = new int[newSize];
+ System.arraycopy(targetMapComponentSizes, 0, newSizeArray, 0, targetMapComponentSizes.length);
+ targetMapComponentSizes = newSizeArray;
+ }
+
+ // Prepare array -- allocate it on first use, otherwise enlarge the existing one
+ int[] current = targetMap[sourceId];
+ if (current == null) {
+ assert targetMapComponentSizes[sourceId] == 0;
+ current = new int[1];
+ } else {
+ current = ArrayUtil.grow(current); // NOTE(review): called for every additional mapping, not only when the array is full -- confirm against ArrayUtil.grow
+ }
+ targetMap[sourceId] = current;
+
+ int[] targets = targetMap[sourceId];
+ targets[targetMapComponentSizes[sourceId]] = wordId; // store at the first unused slot
+ targetMapComponentSizes[sourceId]++;
+ targetMapSize = Math.max(targetMapSize, sourceId + 1);
+ }
+
+ protected final String getBaseFileName(String baseDir) throws IOException { // baseDir + separator + fully-qualified name of implClazz with '.' replaced by the separator char
+ return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
+ }
+
+ /**
+ * Writes the dictionary, target map and POS dictionary files, in that order.
+ * Each file name is the base name from getBaseFileName(baseDir) plus the matching filename-suffix constant from BinaryDictionary.
+ * Dictionary file layout as written by writeDictionary(): codec header, buffer size (vInt), then the entries as encoded by put(): [entry:{left id(short)}{right id(short)}{word cost(short)}{pos index(byte)}{base form}{reading}{pronunciation}], [entry...], [entry...].....
+ * @throws IOException if writing any of the files fails
+ */
+ public void write(String baseDir) throws IOException {
+ final String baseName = getBaseFileName(baseDir);
+ writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
+ writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
+ writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+ }
+
+ protected void writeTargetMap(String filename) throws IOException { // layout: codec header, targetMapSize (vInt), then the per-source-id records
+ new File(filename).getParentFile().mkdirs(); // create missing parent directories; the result is not checked
+ OutputStream os = new FileOutputStream(filename);
+ try {
+ os = new BufferedOutputStream(os);
+ final DataOutput out = new OutputStreamDataOutput(os);
+ CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
+ out.writeVInt(targetMapSize);
+ int nulls = 0; // length of the current run of source ids that have no targets
+ for (int j = 0; j < targetMapSize; j++) {
+ final int size = targetMapComponentSizes[j];
+ if (size == 0) {
+ // run-length encoding for all nulls: a 0 marker is written once when a run starts, the run length is written when the run ends
+ if (nulls == 0) {
+ out.writeVInt(0);
+ }
+ nulls++;
+ } else {
+ if (nulls > 0) { // a run of empty slots just ended: emit its length
+ out.writeVInt(nulls);
+ nulls = 0;
+ }
+ final int[] a = targetMap[j];
+ assert size > 0 && size <= a.length;
+ out.writeVInt(size); // number of targets, followed by the targets themselves
+ for (int i = 0; i < size; i++) {
+ out.writeVInt(a[i]);
+ }
+ }
+ }
+ // write the pending RLE count:
+ if (nulls > 0) {
+ out.writeVInt(nulls);
+ }
+ } finally {
+ os.close(); // closes the buffered stream if wrapping succeeded, otherwise the raw file stream
+ }
+ }
+
+ protected void writePosDict(String filename) throws IOException { // layout: codec header, number of POS strings (vInt), then each string in index order
+ new File(filename).getParentFile().mkdirs(); // create missing parent directories; the result is not checked
+ OutputStream os = new FileOutputStream(filename);
+ try {
+ os = new BufferedOutputStream(os);
+ final DataOutput out = new OutputStreamDataOutput(os);
+ CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
+ out.writeVInt(posDict.size());
+ for (String s : posDict) {
+ out.writeString(s);
+ }
+ } finally {
+ os.close();
+ }
+ }
+
+ protected void writeDictionary(String filename) throws IOException { // layout: codec header, buffer size in bytes (vInt), then the raw buffer contents
+ new File(filename).getParentFile().mkdirs(); // create missing parent directories; the result is not checked
+ final FileOutputStream os = new FileOutputStream(filename);
+ try {
+ final DataOutput out = new OutputStreamDataOutput(os);
+ CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
+ out.writeVInt(buffer.position()); // read the size before flip() resets the position
+ final WritableByteChannel channel = Channels.newChannel(os);
+ // Write Buffer
+ buffer.flip(); // set position to 0, set limit to current position
+ channel.write(buffer);
+ assert buffer.remaining() == 0L; // the whole buffer is expected to be written by the single write call
+ } finally {
+ os.close();
+ }
+ }
+
+}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java?rev=1228629&r1=1228628&r2=1228629&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java Sat Jan 7 15:04:21 2012
@@ -36,33 +36,19 @@ public class DictionaryBuilder {
String encoding,
boolean normalizeEntry) throws IOException {
System.out.println("building tokeninfo dict...");
- TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
+ TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
-
- System.out.print(" building double array trie...");
- DoubleArrayTrie trie = tokenInfoBuilder.buildDoubleArrayTrie();
- TokenInfoDictionaryWriter.writeDoubleArrayTrie(outputDirname, trie);
- System.out.println(" done");
-
- System.out.print(" processing target map...");
- for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
- int tokenInfoId = entry.getKey();
- String surfaceform = entry.getValue();
- int doubleArrayId = trie.lookup(surfaceform.toCharArray(), 0, surfaceform.length());
- assert doubleArrayId > 0;
- tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
- }
tokenInfoDictionary.write(outputDirname);
- trie = null;
+ tokenInfoDictionary = null;
tokenInfoBuilder = null;
-
- System.out.println(" done");
System.out.println("done");
System.out.print("building unknown word dict...");
UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
unkDictionary.write(outputDirname);
+ unkDictionary = null;
+ unkBuilder = null;
System.out.println("done");
System.out.print("building connection costs...");
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1228629&r1=1228628&r2=1228629&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Sat Jan 7 15:04:21 2012
@@ -138,6 +138,22 @@ public class TokenInfoDictionaryBuilder
}
}
+ System.out.print(" building double array trie...");
+ DoubleArrayTrie trie = buildDoubleArrayTrie();
+ dictionary.setTrie(trie);
+ System.out.println(" done");
+
+ System.out.print(" processing target map...");
+ assert trie != null;
+ for (Entry<Integer, String> entry : entrySet()) {
+ int tokenInfoId = entry.getKey();
+ String surfaceform = entry.getValue();
+ int doubleArrayId = trie.lookup(surfaceform.toCharArray(), 0, surfaceform.length());
+ assert doubleArrayId > 0;
+ dictionary.addMapping(doubleArrayId, tokenInfoId);
+ }
+ System.out.println(" done");
+
return dictionary;
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java?rev=1228629&r1=1228628&r2=1228629&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java Sat Jan 7 15:04:21 2012
@@ -17,262 +17,34 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
-import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.nio.channels.Channels;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.lucene.store.DataOutput;
-import org.apache.lucene.store.OutputStreamDataOutput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.CodecUtil;
-import org.apache.lucene.util.RamUsageEstimator;
-
-import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
-import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
-public class TokenInfoDictionaryWriter {
- protected ByteBuffer buffer;
- private int targetMapSize = 0;
- private int[][] targetMap = new int[8192][];
- private int[] targetMapComponentSizes = new int[8192];
- private final List<String> posDict = new ArrayList<String>();
- private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
+public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
+
+ private DoubleArrayTrie trie;
public TokenInfoDictionaryWriter(int size) {
- buffer = ByteBuffer.allocate(size);
- }
-
- /**
- * put the entry in map
- * @return current position of buffer, which will be wordId of next entry
- */
- public int put(String[] entry) {
- short leftId = Short.parseShort(entry[1]);
- short rightId = Short.parseShort(entry[2]);
- short wordCost = Short.parseShort(entry[3]);
-
- StringBuilder sb = new StringBuilder();
-
- // build up the POS string
- for (int i = 4; i < 8; i++) {
- sb.append(CSVUtil.quoteEscape(entry[i]));
- if (i < 7) {
- sb.append(',');
- }
- }
- String pos = sb.toString();
- Integer posIndex = posDictLookup.get(pos);
- if (posIndex == null) {
- posIndex = posDict.size();
- posDict.add(pos);
- posDictLookup.put(pos, posIndex);
- assert posDict.size() == posDictLookup.size();
- }
-
- // TODO: what are the parts 9 and 10 that kuromoji does not expose via Token?
- // we need to break all these out (we can structure them inside posdict)
-
- String baseForm = entry[10];
- String reading = entry[11];
- String pronunciation = entry[12];
-
- // extend buffer if necessary
- int left = buffer.remaining();
- // worst case: three short, 4 bytes and features (all as utf-16)
- int worstCase = 6 + 4 + 2*(baseForm.length() + reading.length() + pronunciation.length());
- if (worstCase > left) {
- ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
- buffer.flip();
- newBuffer.put(buffer);
- buffer = newBuffer;
- }
-
- buffer.putShort(leftId);
- buffer.putShort(rightId);
- buffer.putShort(wordCost);
- assert posIndex.intValue() < 256;
- buffer.put(posIndex.byteValue());
-
- if (baseForm.equals(entry[0])) {
- buffer.put((byte)0); // base form is the same as surface form
- } else {
- buffer.put((byte)baseForm.length());
- for (int i = 0; i < baseForm.length(); i++) {
- buffer.putChar(baseForm.charAt(i));
- }
- }
-
- if (isKatakana(reading)) {
- buffer.put((byte) (reading.length() << 1 | 1));
- writeKatakana(reading);
- } else {
- buffer.put((byte) (reading.length() << 1));
- for (int i = 0; i < reading.length(); i++) {
- buffer.putChar(reading.charAt(i));
- }
- }
-
- if (pronunciation.equals(reading)) {
- buffer.put((byte)0); // pronunciation is the same as reading
- } else {
- if (isKatakana(pronunciation)) {
- buffer.put((byte) (pronunciation.length() << 1 | 1));
- writeKatakana(pronunciation);
- } else {
- buffer.put((byte) (pronunciation.length() << 1));
- for (int i = 0; i < pronunciation.length(); i++) {
- buffer.putChar(pronunciation.charAt(i));
- }
- }
- }
-
- return buffer.position();
+ super(TokenInfoDictionary.class, size);
+ this.trie = trie;
}
- private boolean isKatakana(String s) {
- for (int i = 0; i < s.length(); i++) {
- char ch = s.charAt(i);
- if (ch < 0x30A0 || ch > 0x30FF) {
- return false;
- }
- }
- return true;
+ public void setTrie(DoubleArrayTrie trie) {
+ this.trie = trie;
}
- private void writeKatakana(String s) {
- for (int i = 0; i < s.length(); i++) {
- buffer.put((byte) (s.charAt(i) - 0x30A0));
- }
- }
-
- public void addMapping(int sourceId, int wordId) {
- if(targetMap.length <= sourceId) {
- final int newSize = ArrayUtil.oversize(sourceId + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
- int[][] newArray = new int[newSize][];
- System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
- targetMap = newArray;
- int[] newSizeArray = new int[newSize];
- System.arraycopy(targetMapComponentSizes, 0, newSizeArray, 0, targetMapComponentSizes.length);
- targetMapComponentSizes = newSizeArray;
- }
-
- // Prepare array -- extend the length of array
- int[] current = targetMap[sourceId];
- if (current == null) {
- assert targetMapComponentSizes[sourceId] == 0;
- current = new int[1];
- } else {
- current = ArrayUtil.grow(current);
- }
- targetMap[sourceId] = current;
-
- int[] targets = targetMap[sourceId];
- targets[targetMapComponentSizes[sourceId]] = wordId;
- targetMapComponentSizes[sourceId]++;
- targetMapSize = Math.max(targetMapSize, sourceId + 1);
- }
-
- /**
- * Write dictionary in file
- * Dictionary format is:
- * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
- * @throws IOException
- */
+ @Override
public void write(String baseDir) throws IOException {
- final String baseName = baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar);
- writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
- writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
- writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
- }
-
- protected void writeTargetMap(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
- final DataOutput out = new OutputStreamDataOutput(os);
- CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
- out.writeVInt(targetMapSize);
- int nulls = 0;
- for (int j = 0; j < targetMapSize; j++) {
- final int size = targetMapComponentSizes[j];
- if (size == 0) {
- // run-length encoding for all nulls:
- if (nulls == 0) {
- out.writeVInt(0);
- }
- nulls++;
- } else {
- if (nulls > 0) {
- out.writeVInt(nulls);
- nulls = 0;
- }
- final int[] a = targetMap[j];
- assert size > 0 && size <= a.length;
- out.writeVInt(size);
- for (int i = 0; i < size; i++) {
- out.writeVInt(a[i]);
- }
- }
- }
- // write the pending RLE count:
- if (nulls > 0) {
- out.writeVInt(nulls);
- }
- } finally {
- os.close();
- }
- }
-
- protected void writePosDict(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
- final DataOutput out = new OutputStreamDataOutput(os);
- CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
- out.writeVInt(posDict.size());
- for (String s : posDict) {
- out.writeString(s);
- }
- } finally {
- os.close();
- }
- System.out.println("Info: wrote " + posDict.size() + " unique POS entries");
- }
-
- protected void writeDictionary(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- final FileOutputStream os = new FileOutputStream(filename);
- try {
- final DataOutput out = new OutputStreamDataOutput(os);
- CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
- out.writeVInt(buffer.position());
- final WritableByteChannel channel = Channels.newChannel(os);
- // Write Buffer
- buffer.flip(); // set position to 0, set limit to current position
- channel.write(buffer);
- assert buffer.remaining() == 0L;
- } finally {
- os.close();
- }
+ super.write(baseDir);
+ writeDoubleArrayTrie(getBaseFileName(baseDir) + TokenInfoDictionary.TRIE_FILENAME_SUFFIX);
}
- public static void writeDoubleArrayTrie(String baseDir, DoubleArrayTrie trie) throws IOException {
- String filename = baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + TokenInfoDictionary.TRIE_FILENAME_SUFFIX;
+ protected void writeDoubleArrayTrie(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
-
final FileOutputStream os = new FileOutputStream(filename);
try {
trie.write(os);
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java?rev=1228629&r1=1228628&r2=1228629&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java Sat Jan 7 15:04:21 2012
@@ -7,11 +7,11 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
-public class UnknownDictionaryWriter extends TokenInfoDictionaryWriter {
+public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
public UnknownDictionaryWriter(int size) {
- super(size);
+ super(UnknownDictionary.class, size);
}
@Override
@@ -42,17 +42,9 @@ public class UnknownDictionaryWriter ext
characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
}
- /**
- * Write dictionary in file
- * Dictionary format is:
- * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
- * @throws IOException
- */
+ @Override
public void write(String baseDir) throws IOException {
- final String baseName = baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar);
- writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
- writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
- writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+ super.write(baseDir);
characterDefinition.write(baseDir);
}
}