You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/18 17:39:16 UTC

svn commit: r1533517 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: java/org/apache/lucene/analysis/ko/dic/ java/org/apache/lucene/analysis/ko/tagging/ resources/org/apache/lucene/analysis/ko/dic/

Author: uschindler
Date: Fri Oct 18 15:39:16 2013
New Revision: 1533517

URL: http://svn.apache.org/r1533517
Log:
LUCENE-4956: Make parser more strict, remove bullshit from data files

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Fri Oct 18 15:39:16 2013
@@ -53,9 +53,11 @@ public class DictionaryUtil {
     try {
       final LineProcessor proc = new LineProcessor() {
         @Override
-        public void processLine(String line) {
+        public void processLine(String line) throws IOException {
           String[] infos = line.split("[,]+");
-          if(infos.length!=2) return;
+          if(infos.length!=2) {
+            throw new IOException("Invalid file format: "+line);
+          }
           infos[1] = infos[1].trim();
           if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
           
@@ -68,34 +70,41 @@ public class DictionaryUtil {
       
       DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS, new LineProcessor() {
         @Override
-        public void processLine(String compound) {
+        public void processLine(String compound) throws IOException {
           String[] infos = compound.split("[:]+");
-          if(infos.length!=3&&infos.length!=2) return;
+          if(infos.length!=3 && infos.length!=2) {
+            throw new IOException("Invalid file format: "+compound);
+          }
           
           final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
           final WordEntry entry;
-          if(infos.length==2) 
+          if(infos.length==2) {
             entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray(), c);
-          else 
+          } else { 
             entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"0X").toCharArray(), c);
+          }
           dictionary.add(entry.getWord(), entry);          
         }       
       }); 
       
       DictionaryResources.readLines(DictionaryResources.FILE_ABBREV, new LineProcessor() {
         @Override
-        public void processLine(String abbrev) {
+        public void processLine(String abbrev) throws IOException {
           String[] infos = abbrev.split("[:]+");
-          if(infos.length!=2) return;      
+          if(infos.length!=2) {
+            throw new IOException("Invalid file format: "+abbrev);
+          }
           abbreviations.put(infos[0].trim(), infos[1].trim());          
         }
       });
       
       DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS, new LineProcessor() {
         @Override
-        public void processLine(String compound) {
+        public void processLine(String compound) throws IOException {
           String[] infos = compound.split("[:]+");
-          if(infos.length!=2) return;
+          if(infos.length!=2) {
+            throw new IOException("Invalid file format: "+compound);
+          }
           WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
           uncompounds.put(entry.getWord(), entry);
         }
@@ -103,9 +112,11 @@ public class DictionaryUtil {
   
       DictionaryResources.readLines(DictionaryResources.FILE_CJ, new LineProcessor() {
         @Override
-        public void processLine(String cj) {
+        public void processLine(String cj) throws IOException {
           String[] infos = cj.split("[:]+");
-          if(infos.length!=2) return;
+          if(infos.length!=2) {
+            throw new IOException("Invalid file format: "+cj);
+          }
           cjwords.put(infos[0], infos[1]);
         }
       });
@@ -118,8 +129,8 @@ public class DictionaryUtil {
   
       readFileToSet(suffixs,DictionaryResources.FILE_SUFFIX);
       
-    } catch (IOException e) {      
-      new Error("Cannot load resource",e);
+    } catch (IOException e) {
+      throw new Error("Cannot load resource",e);
     }
   }
 

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java Fri Oct 18 15:39:16 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ko.ta
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
@@ -47,7 +46,7 @@ public class Tagger {
         public void processLine(String str) throws IOException {
           String[] syls = str.split("[:]+");
           if(syls.length!=4)
-            throw new IOException("Invalid file format: "+Arrays.toString(syls));
+            throw new IOException("Invalid file format: "+str);
           
           final String key;        
           if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic Fri Oct 18 15:39:16 2013
@@ -13,5 +13,4 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-//#######
 거나:이/t,거나/e

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic Fri Oct 18 15:39:16 2013
@@ -13,5 +13,4 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-###################
 金融:금융
\ No newline at end of file

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-//#######
 거나
 거늘
 거니

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-//#######
 가
 같이
 같이나

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-####
 최
 고
 남

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
 ! See the License for the specific language governing permissions and
 ! limitations under the License.
 !
-#####
 각
 감
 값