You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 08:22:47 UTC
svn commit: r1534032 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko: dic/DictionaryUtil.java dic/WordEntry.java morph/CompoundNounAnalyzer.java
Author: rmuir
Date: Mon Oct 21 06:22:46 2013
New Revision: 1534032
URL: http://svn.apache.org/r1534032
Log:
LUCENE-4956: don't use wordentry for uncompound processing
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534032&r1=1534031&r2=1534032&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 06:22:46 2013
@@ -41,7 +41,7 @@ public class DictionaryUtil {
private static final Set<String> suffixs = new HashSet<String>();;
- private static final Map<String,WordEntry> uncompounds = new HashMap<String,WordEntry>();
+ private static final Set<String> uncompounds = new HashSet<String>();
static {
try {
@@ -87,8 +87,7 @@ public class DictionaryUtil {
if(infos.length!=2) {
throw new IOException("Invalid file format: "+compound);
}
- WordEntry entry = new WordEntry(infos[0].trim(),"900000000X".toCharArray(), compoundArrayToList(infos[1], infos[1].split("[,]+")));
- uncompounds.put(entry.getWord(), entry);
+ uncompounds.add(infos[1]);
}
});
@@ -167,8 +166,9 @@ public class DictionaryUtil {
}
}
- public static WordEntry getUncompound(String key) {
- return uncompounds.get(key);
+ // TODO: make this more efficient later
+ public static boolean isUncompound(String before, String after) {
+ return uncompounds.contains(before + "," + after);
}
public static boolean existJosa(String str) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java?rev=1534032&r1=1534031&r2=1534032&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java Mon Oct 21 06:22:46 2013
@@ -73,10 +73,9 @@ public class WordEntry {
this.word = word;
this.features = cs;
this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
- // has compound list iff compound feature is set ('2' in main dictionary, '9' in uncompounds)
- // TODO: implement validCompound check differently: uncompounds shouldnt use wordentry
- assert (features[IDX_NOUN] >= '2' && compounds != null && compounds.size() > 1)
- || (features[IDX_NOUN] <= '2' && compounds == null) : "inconsistent compound data for word: " + word;
+ // has compound list iff compound feature is set
+ assert (features[IDX_NOUN] == '2' && compounds != null && compounds.size() > 1)
+ || (features[IDX_NOUN] != '2' && compounds == null) : "inconsistent compound data for word: " + word;
}
public String getWord() {
@@ -90,7 +89,7 @@ public class WordEntry {
/** Returns true if entry is a compound noun */
public boolean isCompoundNoun() {
- return features[IDX_NOUN] >= '2';
+ return features[IDX_NOUN] == '2';
}
/** Returns List of compounds for word */
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1534032&r1=1534031&r2=1534032&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Mon Oct 21 06:22:46 2013
@@ -395,21 +395,15 @@ public class CompoundNounAnalyzer {
if(after.length()==1&&!isFirst&&!DictionaryUtil.existSuffix(after)) return false;
if(pos!=1&&before.length()==1) {
-
- WordEntry entry1 = DictionaryUtil.getUncompound(before+after);
- if(entry1!=null){
- List<CompoundEntry> compounds = entry1.getCompounds();
- if(before.equals(compounds.get(0).getWord())&&
- after.equals(compounds.get(1).getWord())) return false;
+ if (DictionaryUtil.isUncompound(before, after)) {
+ return false;
}
-
}
-
- WordEntry entry2 = after.length()==1 ? null : DictionaryUtil.getUncompound(after);
- if(entry2!=null){
- List<CompoundEntry> compounds = entry2.getCompounds();
- if("*".equals(compounds.get(0).getWord())&&
- after.equals(compounds.get(1).getWord())) return false;
+
+ if (after.length() != 1) {
+ if (DictionaryUtil.isUncompound("*", after)) {
+ return false;
+ }
}
return true;