You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 08:22:47 UTC
svn commit: r1534032 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko: dic/DictionaryUtil.java dic/WordEntry.java morph/CompoundNounAnalyzer.java

Author: rmuir
Date: Mon Oct 21 06:22:46 2013
New Revision: 1534032

URL: http://svn.apache.org/r1534032
Log:
LUCENE-4956: don't use wordentry for uncompound processing

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534032&r1=1534031&r2=1534032&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 06:22:46 2013
@@ -41,7 +41,7 @@ public class DictionaryUtil {
   
   private static final Set<String> suffixs = new HashSet<String>();;
   
-  private static final Map<String,WordEntry> uncompounds = new HashMap<String,WordEntry>();
+  private static final Set<String> uncompounds = new HashSet<String>();
   
   static {  
     try {
@@ -87,8 +87,7 @@ public class DictionaryUtil {
           if(infos.length!=2) {
             throw new IOException("Invalid file format: "+compound);
           }
-          WordEntry entry = new WordEntry(infos[0].trim(),"900000000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
-          uncompounds.put(entry.getWord(), entry);
+          uncompounds.add(infos[1]);
         }
       });
 
@@ -167,8 +166,9 @@ public class DictionaryUtil {
     }
   }
   
-  public static WordEntry getUncompound(String key) {
-    return uncompounds.get(key);
+  // TODO: make this more efficient later
+  public static boolean isUncompound(String before, String after) {
+    return uncompounds.contains(before + "," + after);
   }
   
   public static boolean existJosa(String str) {

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java?rev=1534032&r1=1534031&r2=1534032&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java Mon Oct 21 06:22:46 2013
@@ -73,10 +73,9 @@ public class WordEntry {
     this.word = word;
     this.features = cs;
     this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
-    // has compound list iff compound feature is set ('2' in main dictionary, '9' in uncompounds)
-    // TODO: implement validCompound check differently: uncompounds shouldnt use wordentry
-    assert (features[IDX_NOUN] >= '2' && compounds != null && compounds.size() > 1) 
-        || (features[IDX_NOUN] <= '2' && compounds == null) : "inconsistent compound data for word: " + word;
+    // has compound list iff compound feature is set
+    assert (features[IDX_NOUN] == '2' && compounds != null && compounds.size() > 1) 
+        || (features[IDX_NOUN] != '2' && compounds == null) : "inconsistent compound data for word: " + word;
   }
   
   public String getWord() {
@@ -90,7 +89,7 @@ public class WordEntry {
   
   /** Returns true if entry is a compound noun */
   public boolean isCompoundNoun() {
-    return features[IDX_NOUN] >= '2';
+    return features[IDX_NOUN] == '2';
   }
   
   /** Returns List of compounds for word */

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1534032&r1=1534031&r2=1534032&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Mon Oct 21 06:22:46 2013
@@ -395,21 +395,15 @@ public class CompoundNounAnalyzer {
     if(after.length()==1&&!isFirst&&!DictionaryUtil.existSuffix(after)) return false;
 
     if(pos!=1&&before.length()==1) {
-      
-      WordEntry entry1 = DictionaryUtil.getUncompound(before+after);  
-      if(entry1!=null){
-        List<CompoundEntry> compounds = entry1.getCompounds();
-        if(before.equals(compounds.get(0).getWord())&&
-            after.equals(compounds.get(1).getWord())) return false;
+      if (DictionaryUtil.isUncompound(before, after)) {
+        return false;
       }
-      
     }
-
-    WordEntry entry2 = after.length()==1 ? null : DictionaryUtil.getUncompound(after);
-    if(entry2!=null){
-      List<CompoundEntry> compounds = entry2.getCompounds();      
-      if("*".equals(compounds.get(0).getWord())&&
-          after.equals(compounds.get(1).getWord())) return false;
+    
+    if (after.length() != 1) {
+      if (DictionaryUtil.isUncompound("*", after)) {
+        return false;
+      }
     }
     
     return true;