You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/04 03:21:28 UTC
svn commit: r1227028 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util:
ConnectionCostsBuilder.java TokenInfoDictionaryBuilder.java
UnknownDictionaryBuilder.java
Author: rmuir
Date: Wed Jan 4 02:21:28 2012
New Revision: 1227028
URL: http://svn.apache.org/viewvc?rev=1227028&view=rev
Log:
LUCENE-3305: be pedantic about charset decoding when reading dictionary files (report malformed or unmappable input instead of silently replacing it)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java?rev=1227028&r1=1227027&r2=1227028&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java Wed Jan 4 02:21:28 2012
@@ -21,6 +21,9 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
@@ -32,7 +35,11 @@ public class ConnectionCostsBuilder {
public static ConnectionCosts build(String filename) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream);
+ Charset cs = Charset.forName("US-ASCII");
+ CharsetDecoder decoder = cs.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
LineNumberReader lineReader = new LineNumberReader(streamReader);
String line = lineReader.readLine();
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1227028&r1=1227027&r2=1227028&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Wed Jan 4 02:21:28 2012
@@ -23,6 +23,9 @@ import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
@@ -81,7 +84,11 @@ public class TokenInfoDictionaryBuilder
for (File file : csvFiles){
FileInputStream inputStream = new FileInputStream(file);
- InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+ Charset cs = Charset.forName(encoding);
+ CharsetDecoder decoder = cs.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
BufferedReader reader = new BufferedReader(streamReader);
String line = null;
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1227028&r1=1227027&r2=1227028&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Wed Jan 4 02:21:28 2012
@@ -22,6 +22,9 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
@@ -55,7 +58,11 @@ public class UnknownDictionaryBuilder {
UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+ Charset cs = Charset.forName(encoding);
+ CharsetDecoder decoder = cs.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
LineNumberReader lineReader = new LineNumberReader(streamReader);
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));