You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/04 03:21:28 UTC

svn commit: r1227028 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util: ConnectionCostsBuilder.java TokenInfoDictionaryBuilder.java UnknownDictionaryBuilder.java

Author: rmuir
Date: Wed Jan  4 02:21:28 2012
New Revision: 1227028

URL: http://svn.apache.org/viewvc?rev=1227028&view=rev
Log:
LUCENE-3305: be pedantic about charset decoding dictionary files

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java?rev=1227028&r1=1227027&r2=1227028&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java Wed Jan  4 02:21:28 2012
@@ -21,6 +21,9 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 
 import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
 
@@ -32,7 +35,11 @@ public class ConnectionCostsBuilder {
   
   public static ConnectionCosts build(String filename) throws IOException {
     FileInputStream inputStream = new FileInputStream(filename);
-    InputStreamReader streamReader = new InputStreamReader(inputStream);
+    Charset cs = Charset.forName("US-ASCII");
+    CharsetDecoder decoder = cs.newDecoder()
+        .onMalformedInput(CodingErrorAction.REPORT)
+        .onUnmappableCharacter(CodingErrorAction.REPORT);
+    InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
     LineNumberReader lineReader = new LineNumberReader(streamReader);
     
     String line = lineReader.readLine();

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1227028&r1=1227027&r2=1227028&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Wed Jan  4 02:21:28 2012
@@ -23,6 +23,9 @@ import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map.Entry;
@@ -81,7 +84,11 @@ public class TokenInfoDictionaryBuilder 
     
     for (File file : csvFiles){
       FileInputStream inputStream = new FileInputStream(file);
-      InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+      Charset cs = Charset.forName(encoding);
+      CharsetDecoder decoder = cs.newDecoder()
+          .onMalformedInput(CodingErrorAction.REPORT)
+          .onUnmappableCharacter(CodingErrorAction.REPORT);
+      InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
       BufferedReader reader = new BufferedReader(streamReader);
       
       String line = null;

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1227028&r1=1227027&r2=1227028&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Wed Jan  4 02:21:28 2012
@@ -22,6 +22,9 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 
 import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
 
@@ -55,7 +58,11 @@ public class UnknownDictionaryBuilder {
     UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
     
     FileInputStream inputStream = new FileInputStream(filename);
-    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+    Charset cs = Charset.forName(encoding);
+    CharsetDecoder decoder = cs.newDecoder()
+        .onMalformedInput(CodingErrorAction.REPORT)
+        .onUnmappableCharacter(CodingErrorAction.REPORT);
+    InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
     LineNumberReader lineReader = new LineNumberReader(streamReader);
     
     dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));