You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/18 17:39:16 UTC
svn commit: r1533517 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/tagging/
resources/org/apache/lucene/analysis/ko/dic/
Author: uschindler
Date: Fri Oct 18 15:39:16 2013
New Revision: 1533517
URL: http://svn.apache.org/r1533517
Log:
LUCENE-4956: Make parser more strict, remove bullshit from data files
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Fri Oct 18 15:39:16 2013
@@ -53,9 +53,11 @@ public class DictionaryUtil {
try {
final LineProcessor proc = new LineProcessor() {
@Override
- public void processLine(String line) {
+ public void processLine(String line) throws IOException {
String[] infos = line.split("[,]+");
- if(infos.length!=2) return;
+ if(infos.length!=2) {
+ throw new IOException("Invalid file format: "+line);
+ }
infos[1] = infos[1].trim();
if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
@@ -68,34 +70,41 @@ public class DictionaryUtil {
DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS, new LineProcessor() {
@Override
- public void processLine(String compound) {
+ public void processLine(String compound) throws IOException {
String[] infos = compound.split("[:]+");
- if(infos.length!=3&&infos.length!=2) return;
+ if(infos.length!=3 && infos.length!=2) {
+ throw new IOException("Invalid file format: "+compound);
+ }
final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
final WordEntry entry;
- if(infos.length==2)
+ if(infos.length==2) {
entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray(), c);
- else
+ } else {
entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"0X").toCharArray(), c);
+ }
dictionary.add(entry.getWord(), entry);
}
});
DictionaryResources.readLines(DictionaryResources.FILE_ABBREV, new LineProcessor() {
@Override
- public void processLine(String abbrev) {
+ public void processLine(String abbrev) throws IOException {
String[] infos = abbrev.split("[:]+");
- if(infos.length!=2) return;
+ if(infos.length!=2) {
+ throw new IOException("Invalid file format: "+abbrev);
+ }
abbreviations.put(infos[0].trim(), infos[1].trim());
}
});
DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS, new LineProcessor() {
@Override
- public void processLine(String compound) {
+ public void processLine(String compound) throws IOException {
String[] infos = compound.split("[:]+");
- if(infos.length!=2) return;
+ if(infos.length!=2) {
+ throw new IOException("Invalid file format: "+compound);
+ }
WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
uncompounds.put(entry.getWord(), entry);
}
@@ -103,9 +112,11 @@ public class DictionaryUtil {
DictionaryResources.readLines(DictionaryResources.FILE_CJ, new LineProcessor() {
@Override
- public void processLine(String cj) {
+ public void processLine(String cj) throws IOException {
String[] infos = cj.split("[:]+");
- if(infos.length!=2) return;
+ if(infos.length!=2) {
+ throw new IOException("Invalid file format: "+cj);
+ }
cjwords.put(infos[0], infos[1]);
}
});
@@ -118,8 +129,8 @@ public class DictionaryUtil {
readFileToSet(suffixs,DictionaryResources.FILE_SUFFIX);
- } catch (IOException e) {
- new Error("Cannot load resource",e);
+ } catch (IOException e) {
+ throw new Error("Cannot load resource",e);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java Fri Oct 18 15:39:16 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ko.ta
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
@@ -47,7 +46,7 @@ public class Tagger {
public void processLine(String str) throws IOException {
String[] syls = str.split("[:]+");
if(syls.length!=4)
- throw new IOException("Invalid file format: "+Arrays.toString(syls));
+ throw new IOException("Invalid file format: "+str);
final String key;
if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/abbreviation.dic Fri Oct 18 15:39:16 2013
@@ -13,5 +13,4 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-//#######
ê±°ë:ì´/t,ê±°ë/e
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/cj.dic Fri Oct 18 15:39:16 2013
@@ -13,5 +13,4 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-###################
金融:금융
\ No newline at end of file
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/eomi.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-//#######
ê±°ë
ê±°ë
ê±°ë
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/josa.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-//#######
ê°
ê°ì´
ê°ì´ë
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/prefix.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-####
ìµ
ê³
ë¨
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic?rev=1533517&r1=1533516&r2=1533517&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/suffix.dic Fri Oct 18 15:39:16 2013
@@ -13,7 +13,6 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-#####
ê°
ê°
ê°