You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/27 23:35:27 UTC
svn commit: r1536214 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/morph/ test/org/apache/lucene/analysis/ko/
Author: rmuir
Date: Sun Oct 27 22:35:26 2013
New Revision: 1536214
URL: http://svn.apache.org/r1536214
Log:
LUCENE-4956: more speedup,style,refactoring
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutputComparator.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/EomiUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestCompoundSegment.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestMorphologicalAnalyzer.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Sun Oct 27 22:35:26 2013
@@ -43,10 +43,10 @@ import org.apache.lucene.analysis.tokena
public final class KoreanFilter extends TokenFilter {
- private final LinkedList<Token> morphQueue = new LinkedList<Token>();;
- private final MorphAnalyzer morph = new MorphAnalyzer();
+ private final LinkedList<Token> morphQueue = new LinkedList<Token>();
+ private final MorphAnalyzer morph;
private final WordSpaceAnalyzer wsAnal = new WordSpaceAnalyzer();
- private final CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
+ private final CompoundNounAnalyzer cnAnalyzer;
private State currentState = null;
@@ -85,10 +85,11 @@ public final class KoreanFilter extends
public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch, boolean cnoun) {
super(input);
- cnAnalyzer.setExactMach(exactMatch);
this.bigrammable = bigram;
this.hasOrigin = has;
this.originCNoun = cnoun;
+ this.cnAnalyzer = new CompoundNounAnalyzer(exactMatch);
+ this.morph = new MorphAnalyzer(exactMatch);
}
public boolean incrementToken() throws IOException {
@@ -339,40 +340,32 @@ public final class KoreanFilter extends
// 추출된 명사가 복합명사인 경우 분리한다.
for(int i=0;i<maxCandidate;i++) {
- List<CompoundEntry> results = confirmCNoun(candiList.get(i).toString());
+ List<CompoundEntry> results = cnAnalyzer.analyze(candiList.get(i).toString());
int pos = 0;
int offset = 0;
- for(CompoundEntry entry : results) {
- pos += entry.getWord().length();
- if(cnounMap.get(entry.getWord())!=null) continue;
+ if (results != null) {
+ for(CompoundEntry entry : results) {
+ pos += entry.getWord().length();
+ if(cnounMap.get(entry.getWord())!=null) continue;
- // 한글과 매치되는 한자를 짤라서 큐에 저장한다.
- // nocommit: this is avoiding AIOOBE, original code:
- // morphQueue.add(new IndexWord(term.substring(offset,pos),offset));
- morphQueue.add(new Token(term.substring(offset,Math.min(pos, term.length())),offset));
- cnounMap.put(entry.getWord(), entry.getWord());
+ // 한글과 매치되는 한자를 짤라서 큐에 저장한다.
+ // nocommit: this is avoiding AIOOBE, original code:
+ // morphQueue.add(new IndexWord(term.substring(offset,pos),offset));
+ morphQueue.add(new Token(term.substring(offset,Math.min(pos, term.length())),offset));
+ cnounMap.put(entry.getWord(), entry.getWord());
- if(entry.getWord().length()<2) continue; // 한글은 2글자 이상만 저장한다.
+ if(entry.getWord().length()<2) continue; // 한글은 2글자 이상만 저장한다.
- // 분리된 한글을 큐에 저장한다.
- morphQueue.add(new Token(entry.getWord(),offset));
+ // 분리된 한글을 큐에 저장한다.
+ morphQueue.add(new Token(entry.getWord(),offset));
- offset = pos;
- }
+ offset = pos;
+ }
+ }
}
}
- private List<CompoundEntry> confirmCNoun(String input) {
-
- WordEntry cnoun = DictionaryUtil.getAllNoun(input);
- if(cnoun!=null && cnoun.isCompoundNoun()) {
- return cnoun.getCompounds();
- }
-
- return cnAnalyzer.analyze(input);
- }
-
private boolean isAlphaNumChar(int c) {
if((c>=48&&c<=57)||(c>=65&&c<=122)) return true;
return false;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Sun Oct 27 22:35:26 2013
@@ -38,11 +38,11 @@ public class DictionaryUtil {
private static final Set<String> josas = new HashSet<String>();
- private static final Set<String> eomis = new HashSet<String>();;
+ private static final Set<String> eomis = new HashSet<String>();
- private static final Set<String> prefixs = new HashSet<String>();;
+ private static final Set<String> prefixs = new HashSet<String>();
- private static final Set<String> suffixs = new HashSet<String>();;
+ private static final Set<String> suffixs = new HashSet<String>();
private static final Set<String> uncompounds = new HashSet<String>();
@@ -127,6 +127,11 @@ public class DictionaryUtil {
return getWord(key, WordEntry.NOUN, WordEntry.COMPOUND);
}
+ /** Looks up a compound noun */
+ public static WordEntry getCompoundNoun(String key) {
+ return getWord(key, WordEntry.COMPOUND, 0);
+ }
+
/** return all noun including compound noun */
public static WordEntry getAllNoun(String key) {
return getWord(key, WordEntry.NOUN, 0);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java Sun Oct 27 22:35:26 2013
@@ -48,102 +48,112 @@ public class AnalysisOutput implements C
private int dicWordLen = 0; // the sum of the length of words within compound nouns
public AnalysisOutput(String stem, String josa, String eomi, int patn) {
- this.score = SCORE_ANALYSIS;
- this.stem=stem;
- this.josa = josa;
- this.eomi = eomi;
- this.patn = patn;
+ this(stem, josa, eomi, patn, SCORE_ANALYSIS);
}
public AnalysisOutput(String stem, String josa, String eomi, int patn, int score) {
- this(stem,josa,eomi,patn);
- this.score = score;
+ this(stem, josa, eomi, (char)0, patn, score);
}
public AnalysisOutput(String stem, String josa, String eomi, char pos, int patn, int score) {
- this(stem,josa,eomi,patn,score);
+ this.score = score;
+ this.stem = stem;
+ this.josa = josa;
+ this.eomi = eomi;
+ this.patn = patn;
this.pos = pos;
}
- public void setScore(int i) {
- this.score = i;
- }
- public void setPatn(int i) {
- this.patn = i;
+ public int getScore() {
+ return score;
}
- public void setStem(String s) {
- this.stem = s;
+ public void setScore(int score) {
+ this.score = score;
}
-
- public void setPos(char c) {
- this.pos = c;
+ public int getPatn() {
+ return patn;
}
- public void setNsfx(String s) {
- this.nsfx = s;
+ public void setPatn(int patn) {
+ this.patn = patn;
}
+
+ public String getStem() {
+ return stem;
+ }
- public void setJosa(String s) {
- this.josa = s;
+ public void setStem(String stem) {
+ this.stem = stem;
}
- public void setEomi(String s){
- this.eomi = s;
+ public char getPos() {
+ return pos;
}
- public void addElist(String l){
- this.elist.add(l);
- }
-
- public void setElist(String l, int index){
- this.elist.set(index,l);
+ public void setPos(char pos) {
+ this.pos = pos;
}
- public void setPomi(String s) {
- this.pomi = s;
+ public String getNsfx() {
+ return nsfx;
}
- public void setXverb(String s){
- this.xverb=s;
+
+ public void setNsfx(String nsfx) {
+ this.nsfx = nsfx;
}
- public void setVsfx(String s) {
- this.vsfx = s;
+
+ public String getJosa() {
+ return josa;
}
-
- public int getScore() {
- return this.score;
+
+ public void setJosa(String josa) {
+ this.josa = josa;
}
- public int getPatn() {
- return this.patn;
+
+ public String getEomi() {
+ return eomi;
}
-
- public String getStem() {
- return stem;
- }
- public char getPos() {
- return this.pos;
+
+ public void setEomi(String eomi) {
+ this.eomi = eomi;
}
- public String getNsfx() {
- return this.nsfx;
+
+ public List<String> getElist() {
+ return elist;
}
- public String getJosa() {
- return this.josa;
+
+ public void addElist(String element) {
+ elist.add(element);
}
- public String getEomi() {
- return this.eomi;
+
+ public void setElist(String element, int index) {
+ elist.set(index, element);
}
- public List<String> getElist() {
- return this.elist;
+
+ public String getPomi() {
+ return pomi;
}
- public String getPomi(){
- return this.pomi;
+
+ public void setPomi(String pomi) {
+ this.pomi = pomi;
}
+
public String getXverb() {
- return this.xverb;
+ return xverb;
+ }
+
+ public void setXverb(String xverb) {
+ this.xverb = xverb;
}
+
public String getVsfx() {
- return this.vsfx;
+ return vsfx;
+ }
+
+ public void setVsfx(String vsfx) {
+ this.vsfx = vsfx;
}
public int getMaxWordLen() {
@@ -162,19 +172,19 @@ public class AnalysisOutput implements C
this.dicWordLen = dicWordLen;
}
- public void addCNoun(CompoundEntry w) {
- compound.add(w);
- }
-
public List<CompoundEntry> getCNounList() {
return compound;
}
- public void setCNoun(List<CompoundEntry> cnoun) {
+ public void setCNounList(List<CompoundEntry> cnoun) {
compound = cnoun;
}
- public void addCNoun(List<CompoundEntry> cnoun) {
+ public void addCNoun(CompoundEntry entry) {
+ compound.add(entry);
+ }
+
+ public void addCNouns(List<CompoundEntry> cnoun) {
compound.addAll(cnoun);
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutputComparator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutputComparator.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutputComparator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutputComparator.java Sun Oct 27 22:35:26 2013
@@ -29,25 +29,31 @@ class AnalysisOutputComparator<T> implem
int pattern = out2.getPatn()-out1.getPatn();
int len = out1.getStem().length()-out2.getStem().length();
- if(score!=0) return score;
+ if (score != 0) {
+ return score;
+ }
- if(out2.getScore()==AnalysisOutput.SCORE_CORRECT &&
- out1.getScore()==AnalysisOutput.SCORE_CORRECT) {
- pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern;
- pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 1 : pattern;
+ if (out2.getScore() == AnalysisOutput.SCORE_CORRECT &&
+ out1.getScore() == AnalysisOutput.SCORE_CORRECT) {
+ pattern = out1.getPatn() == PatternConstants.PTN_N || out1.getPatn() == PatternConstants.PTN_AID ? -1 : pattern;
+ pattern = out2.getPatn() == PatternConstants.PTN_N || out2.getPatn() == PatternConstants.PTN_AID ? 1 : pattern;
}
- if(pattern!=0) return pattern;
+ if (pattern != 0) {
+ return pattern;
+ }
- if(out2.getScore()==AnalysisOutput.SCORE_COMPOUNDS &&
- out1.getScore()==AnalysisOutput.SCORE_COMPOUNDS) {
- if(out2.getMaxWordLen()!=out1.getMaxWordLen())
+ if (out2.getScore() == AnalysisOutput.SCORE_COMPOUNDS &&
+ out1.getScore() == AnalysisOutput.SCORE_COMPOUNDS) {
+ if (out2.getMaxWordLen() != out1.getMaxWordLen()) {
return out2.getMaxWordLen()-out1.getMaxWordLen();
- if(out2.getDicWordLen()!=out1.getDicWordLen())
+ }
+ if (out2.getDicWordLen() != out1.getDicWordLen()) {
return out2.getDicWordLen()-out1.getDicWordLen();
+ }
}
- if(out2.getPatn()==out1.getPatn()) {
+ if (out2.getPatn() == out1.getPatn()) {
len = out2.getStem().length()-out1.getStem().length();
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Sun Oct 27 22:35:26 2013
@@ -29,184 +29,158 @@ import org.apache.lucene.analysis.ko.dic
* 복합명사를 분해한다.
*/
public class CompoundNounAnalyzer {
+ private final boolean exactMatch;
- private boolean exactMach = true;
-
- public boolean isExactMach() {
- return exactMach;
- }
-
- public void setExactMach(boolean exactMach) {
- this.exactMach = exactMach;
+ public CompoundNounAnalyzer(boolean exactMatch) {
+ this.exactMatch = exactMatch;
}
+ /** Returns decompounded list for word, or null */
public List<CompoundEntry> analyze(String input) {
-
- WordEntry entry = DictionaryUtil.getAllNoun(input);
- if(entry!=null && entry.isCompoundNoun())
+ WordEntry entry = DictionaryUtil.getCompoundNoun(input);
+ if (entry != null) {
return entry.getCompounds();
-
- return analyze(input,true);
-
+ } else if (input.length() < 3) {
+ return null;
+ } else {
+ CompoundEntry[] compounds = analyze(input, true);
+ if (compounds == null) {
+ return null;
+ } else {
+ // nocommit
+ ArrayList<CompoundEntry> l = new ArrayList<CompoundEntry>();
+ l.addAll(Arrays.asList(compounds));
+ return l;
+ }
+ }
}
-
- public List<CompoundEntry> analyze(String input, boolean isFirst) {
-
- int len = input.length();
- if(len<3) return new ArrayList<CompoundEntry>();
-
- List<CompoundEntry> outputs = new ArrayList<CompoundEntry>();
-
- analyze(input, outputs, isFirst);
-
- return outputs;
+ private CompoundEntry[] analyze(String input, boolean isFirst) {
+ switch(input.length()) {
+ case 3: return analyze3Word(input, isFirst);
+ case 4: return analyze4Word(input, isFirst);
+ case 5: return analyze5Word(input, isFirst);
+ default:
+ List<CompoundEntry> outputs = new ArrayList<>();
+ boolean success = analyzeLongText(input, outputs, isFirst);
+ if (success) {
+ return outputs.toArray(new CompoundEntry[0]); // nocommit
+ } else {
+ return null;
+ }
+ }
}
-
- public boolean analyze(String input, List<CompoundEntry> outputs, boolean isFirst) {
-
- int len = input.length();
- boolean success = false;
-
- switch(len) {
- case 3 :
- success = analyze3Word(input,outputs,isFirst);
- break;
- case 4 :
- success = analyze4Word(input,outputs,isFirst);
- break;
- case 5 :
- success = analyze5Word(input,outputs,isFirst);
- break;
-// case 6 :
-// analyze6Word(input,outputs,isFirst);
-// break;
- default :
- success = analyzeLongText(input,outputs,isFirst);
- }
-
- return success;
- }
-
- private boolean analyze3Word(String input,List<CompoundEntry> outputs, boolean isFirst) {
-
- int[] units1 = {2,1};
- CompoundEntry[] entries1 = analysisBySplited(units1,input,isFirst);
- if(entries1!=null && entries1[0].isExist()&&entries1[1].isExist()) {
- outputs.addAll(Arrays.asList(entries1));
- return true;
- }
-
- int[] units2 = {1,2};
- CompoundEntry[] entries2 = analysisBySplited(units2,input,isFirst);
- if(entries2!=null && entries2[0].isExist()&&entries2[1].isExist()) {
- outputs.addAll(Arrays.asList(entries2));
- return true;
+
+ private static final int[] UNITS_1_2 = {1, 2};
+ private static final int[] UNITS_2_1 = {2, 1};
+
+ private CompoundEntry[] analyze3Word(String input, boolean isFirst) {
+ CompoundEntry[] entries = analysisBySplited(UNITS_2_1, input, isFirst);
+ if (entries != null && entries[0].isExist() && entries[1].isExist()) {
+ return entries;
+ }
+
+ entries = analysisBySplited(UNITS_1_2, input, isFirst);
+ if (entries !=null && entries[0].isExist() && entries[1].isExist()) {
+ return entries;
}
- return false;
+ return null;
}
- private boolean analyze4Word(String input,List<CompoundEntry> outputs, boolean isFirst) {
-
- if(!isFirst) {
- int[] units0 = {1,3};
- CompoundEntry[] entries0 = analysisBySplited(units0,input,isFirst);
- if(entries0!=null && entries0[0].isExist()&&entries0[1].isExist()) {
- outputs.addAll(Arrays.asList(entries0));
- return true;
+ private static final int[] UNITS_1_3 = {1, 3};
+ private static final int[] UNITS_2_2 = {2, 2};
+ private static final int[] UNITS_3_1 = {3, 1};
+ private static final int[] UNITS_1_2_1 = {1, 2, 1};
+
+ private CompoundEntry[] analyze4Word(String input, boolean isFirst) {
+ if (!isFirst) {
+ CompoundEntry[] entries = analysisBySplited(UNITS_1_3, input, false);
+ if (entries != null && entries[0].isExist() && entries[1].isExist()) {
+ return entries;
}
}
- int[] units3 = {3,1};
- CompoundEntry[] entries3 = analysisBySplited(units3,input,isFirst);
- if(entries3!=null && entries3[0].isExist()&&entries3[1].isExist()) {
- outputs.addAll(Arrays.asList(entries3));
- return true;
- }
-
- int[] units1 = {2,2};
- CompoundEntry[] entries1 = analysisBySplited(units1,input,isFirst);
- if(entries1!=null && entries1[0].isExist()&&entries1[1].isExist()) {
- outputs.addAll(Arrays.asList(entries1));
- return true;
- }
-
- int[] units2 = {1,2,1};
- CompoundEntry[] entries2 = analysisBySplited(units2,input,isFirst);
- if(entries2!=null && entries2[0].isExist()&&entries2[1].isExist()&&entries2[2].isExist()) {
- outputs.addAll(Arrays.asList(entries2));
- return true;
+ CompoundEntry[] entries3 = analysisBySplited(UNITS_3_1, input, isFirst);
+ if (entries3 != null && entries3[0].isExist() && entries3[1].isExist()) {
+ return entries3;
+ }
+
+ CompoundEntry[] entries2 = analysisBySplited(UNITS_2_2, input, isFirst);
+ if (entries2 != null && entries2[0].isExist() && entries2[1].isExist()) {
+ return entries2;
+ }
+
+ CompoundEntry[] entries1 = analysisBySplited(UNITS_1_2_1, input, isFirst);
+ if (entries1 != null && entries1[0].isExist() && entries1[1].isExist() && entries1[2].isExist()) {
+ return entries1;
}
-
- if(!exactMach&&entries1!=null && (entries1[0].isExist()||entries1[1].isExist())) {
- outputs.addAll(Arrays.asList(entries1));
- return true;
+ if (!exactMatch && entries2 != null && (entries2[0].isExist() || entries2[1].isExist())) {
+ return entries2;
}
- return false;
+ return null;
}
+
+ private static final int[] UNITS_2_3 = {2, 3};
+ private static final int[] UNITS_3_2 = {3, 2};
+ private static final int[] UNITS_4_1 = {4, 1};
+ private static final int[] UNITS_2_1_2 = {2, 1, 2};
+ private static final int[] UNITS_2_2_1 = {2, 2, 1};
- private boolean analyze5Word(String input,List<CompoundEntry> outputs, boolean isFirst) {
+ private CompoundEntry[] analyze5Word(String input, boolean isFirst) {
- int[] units1 = {2,3};
- CompoundEntry[] entries1 = analysisBySplited(units1,input,isFirst);
- if(entries1!=null && entries1[0].isExist()&&entries1[1].isExist()) {
- outputs.addAll(Arrays.asList(entries1));
- return true;
- }
-
- int[] units2 = {3,2};
- CompoundEntry[] entries2 = analysisBySplited(units2,input,isFirst);
- if(entries2!=null && entries2[0].isExist()&&entries2[1].isExist()) {
- outputs.addAll(Arrays.asList(entries2));
- return true;
- }
-
- int[] units_1 = {4,1};
- CompoundEntry[] entries_1 = analysisBySplited(units_1,input,isFirst);
- if(entries_1!=null && entries_1[0].isExist()&&entries_1[1].isExist()) {
- outputs.addAll(Arrays.asList(entries_1));
- return true;
- }
-
- int[] units3 = {2,2,1};
- CompoundEntry[] entries3 = analysisBySplited(units3,input,isFirst);
- if(entries3!=null && entries3[0].isExist()&&entries3[1].isExist()&&entries3[2].isExist()) {
- outputs.addAll(Arrays.asList(entries3));
- return true;
+ CompoundEntry[] entries1 = analysisBySplited(UNITS_2_3, input, isFirst);
+ if (entries1 != null && entries1[0].isExist() && entries1[1].isExist()) {
+ return entries1;
}
- int[] units4 = {2,1,2};
- CompoundEntry[] entries4 = analysisBySplited(units4,input,isFirst);
- if(entries4!=null && entries4[0].isExist()&&entries4[1].isExist()&&entries4[2].isExist()) {
- outputs.addAll(Arrays.asList(entries4));
- return true;
+ CompoundEntry[] entries2 = analysisBySplited(UNITS_3_2, input, isFirst);
+ if (entries2 != null && entries2[0].isExist() && entries2[1].isExist()) {
+ return entries2;
}
- if(!exactMach&&entries1!=null && (entries1[0].isExist()||entries1[1].isExist())) {
- outputs.addAll(Arrays.asList(entries1));
- return true;
+ CompoundEntry[] entries_1 = analysisBySplited(UNITS_4_1, input, isFirst);
+ if (entries_1 != null && entries_1[0].isExist() && entries_1[1].isExist()) {
+ return entries_1;
}
- if(!exactMach&&entries2!=null && (entries2[0].isExist()||entries2[1].isExist())) {
- outputs.addAll(Arrays.asList(entries2));
- return true;
+ CompoundEntry[] entries3 = analysisBySplited(UNITS_2_2_1, input, isFirst);
+ if (entries3 != null && entries3[0].isExist() && entries3[1].isExist() && entries3[2].isExist()) {
+ return entries3;
}
- boolean is = false;
- if(!exactMach&&entries3!=null && (entries3[0].isExist()||entries3[1].isExist())) {
- outputs.addAll(Arrays.asList(entries3));
- is = true;
+ CompoundEntry[] entries4 = analysisBySplited(UNITS_2_1_2, input, isFirst);
+ if (entries4 != null && entries4[0].isExist() && entries4[1].isExist() && entries4[2].isExist()) {
+ return entries4;
+ }
+
+ if (!exactMatch && entries1 != null && (entries1[0].isExist() || entries1[1].isExist())) {
+ return entries1;
+ }
+
+ if (!exactMatch && entries2 != null && (entries2[0].isExist() || entries2[1].isExist())) {
+ return entries2;
+ }
+
+ CompoundEntry[] res = null;
+ if (!exactMatch && entries3 != null && (entries3[0].isExist() || entries3[1].isExist())) {
+ res = entries3;
}
- if(!exactMach&&entries4!=null && (entries4[0].isExist()||entries4[2].isExist())) {
- outputs.addAll(Arrays.asList(entries4));
- is = true;
+ if (!exactMatch && entries4 != null && (entries4[0].isExist() || entries4[2].isExist())) {
+ if (res == null) {
+ res = entries4;
+ } else {
+ CompoundEntry[] both = new CompoundEntry[res.length + entries4.length];
+ System.arraycopy(res, 0, both, 0, res.length);
+ System.arraycopy(entries4, 0, both, res.length, entries4.length);
+ res = both;
+ }
}
- return is;
+ return res;
}
private boolean analyzeLongText(String input,List<CompoundEntry> outputs, boolean isFirst) {
@@ -244,8 +218,13 @@ public class CompoundNounAnalyzer {
boolean rSuccess = false;
WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);
if(prvEntry==null) {
- pSucess = analyze(prev, results, false);
- if(!pSucess) results.add(new CompoundEntry(prev, false));
+ CompoundEntry res[] = analyze(prev, false);
+ if (res == null) {
+ results.add(new CompoundEntry(prev, false));
+ } else {
+ results.addAll(Arrays.asList(res));
+ pSucess = true;
+ }
} else {
pSucess = true;
if(prvEntry.isCompoundNoun())
@@ -256,8 +235,13 @@ public class CompoundNounAnalyzer {
WordEntry rearEntry = DictionaryUtil.getAllNoun(rear);
if(rearEntry==null) {
- rSuccess = analyze(rear, results, false);
- if(!rSuccess) results.add(new CompoundEntry(rear, false));
+ CompoundEntry res[] = analyze(rear, false);
+ if (res == null) {
+ results.add(new CompoundEntry(rear, false));
+ } else {
+ results.addAll(Arrays.asList(res));
+ rSuccess = true;
+ }
} else {
rSuccess = true;
if(rearEntry.isCompoundNoun())
@@ -310,32 +294,26 @@ public class CompoundNounAnalyzer {
* @return the max length
*/
private int maxWord(String text, boolean hasSuffix, String prvText) {
-
- int maxlen = 0;
- boolean existPrv = false;
-
- // if previous text exist in the dictionary.
- if(prvText.length()>=2)
- existPrv = (DictionaryUtil.getNoun(prvText.substring(prvText.length()-2))!=null);
- if(!existPrv&&prvText.length()>=3)
- existPrv = (DictionaryUtil.getNoun(prvText.substring(prvText.length()-3))!=null);
-
+
for(int i=text.length();i>1;i--) {
-
String seg = text.substring(0,i);
WordEntry entry = DictionaryUtil.getAllNoun(seg);
if(entry==null) continue;
- int len = 0;
- if(i==text.length()-1 && hasSuffix && !existPrv)
- len = i+1;
- else
- len = i;
-
- if(len>maxlen) maxlen = len;
+ if (i == text.length()-1 && hasSuffix) {
+ // if previous text exist in the dictionary.
+ boolean existPrv = false;
+ if(prvText.length()>=2)
+ existPrv = (DictionaryUtil.getNoun(prvText.substring(prvText.length()-2))!=null);
+ if(!existPrv&&prvText.length()>=3)
+ existPrv = (DictionaryUtil.getNoun(prvText.substring(prvText.length()-3))!=null);
+ return existPrv ? i : i+1;
+ } else {
+ return i;
+ }
}
- return maxlen;
+ return 0;
}
private CompoundEntry[] analysisBySplited(int[] units, String input, boolean isFirst) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/EomiUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/EomiUtil.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/EomiUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/EomiUtil.java Sun Oct 27 22:35:26 2013
@@ -174,62 +174,69 @@ class EomiUtil {
* 5. '아/어'의 변이체 분리
*/
static String[] splitEomi(String stem, String end) {
-
- String[] strs = new String[2];
int strlen = stem.length();
- if(strlen==0) return strs;
-
+ if (strlen == 0) {
+ return null;
+ }
+
char estem = stem.charAt(strlen-1);
char[] chrs = MorphUtil.decompose(estem);
- if(chrs.length==1) return strs; // 한글이 아니라면...
+ if (chrs.length==1) {
+ return null; // 한글이 아니라면...
+ }
if((chrs.length==3)
&&(chrs[2]=='ã´'||chrs[2]=='ã¹'||chrs[2]=='ã
'||chrs[2]=='ã
')
&&EomiUtil.IsNLMBSyl(estem,chrs[2])
&& combineAndEomiCheck(chrs[2], end)!=null)
{
+ String strs[] = new String[2];
strs[1] = Character.toString(chrs[2]);
if(end.length()>0) strs[1] += end;
strs[0] = stem.substring(0,strlen-1) + MorphUtil.makeChar(estem, 0);
+ return strs;
}
else if(chrs.length==3 && chrs[2]=='ã¹' && DictionaryUtil.getVerb(stem)!=null
&& combineAndEomiCheck(chrs[2], end)!=null)
{
+ String strs[] = new String[2];
strs[1] = Character.toString(chrs[2]);
if(end.length()>0) strs[1] += end;
strs[0] = stem; // "ë§ë¤ ëìë"ìì "ë§ë¤"ê³¼ ê°ì ê²½ì°
+ return strs;
}
else if(estem=='í´'&&DictionaryUtil.existEomi("ì´"+end))
{
- strs[0] = stem.substring(0,strlen-1)+"í";
- strs[1] = "ì´"+end;
+ return new String[] { stem.substring(0,strlen-1)+"í", "ì´"+end };
}
else if(estem=='í'&&DictionaryUtil.existEomi("ì´"+end))
{
- strs[0] = stem.substring(0,strlen-1)+"í";
- strs[1] = "ì´"+end;
+ return new String[] { stem.substring(0,strlen-1)+"í", "ì´"+end };
}
else if(chrs[0]!='ã
'&&
(chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
')&&
(chrs.length==2 || SyllableFeatures.hasFeature(estem, SyllableFeatures.YNPAH)) &&
(combineAndEomiCheck('ì´', end)!=null))
{
- strs[0] = stem;
- if(chrs.length==2) strs[1] = "ì´"+end;
- else strs[1] = end;
+ if (chrs.length == 2) {
+ return new String[] { stem, "ì´"+end };
+ } else {
+ return new String[] { stem, end };
+ }
}
else if(estem=='í'&&end!=null&&end.startsWith("ì¬")&&
combineAndEomiCheck('ì´', end.substring(1))!=null)
{
- strs[0] = stem;
- strs[1] = "ì´"+end.substring(1);
+ return new String[] { stem, "ì´"+end.substring(1) };
}
else if(estem=='ë ¤'&&end!=null&& // êº¼ë ¤=>꺼리ì´, êº¼ë ¤ì=>꺼리ì´ì
combineAndEomiCheck('ì´', end)!=null)
{
- strs[0] = stem.substring(0,stem.length()-1)+"리";
- strs[1] = "ì´"+end;
+ return new String[] {
+ stem.substring(0,stem.length()-1)+"리",
+ "ì´"+end
+ };
}
else if((chrs.length==2)&&
(chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
')&&
@@ -253,19 +260,19 @@ class EomiUtil {
else if(chrs[1]=='ã
')
sb.append(MorphUtil.makeChar(estem, 20, 0)).append(MorphUtil.replaceJongsung('ì ',estem));
+ String strs[] = new String[2];
strs[0] = sb.toString();
end = strs[0].substring(strs[0].length()-1)+end;
strs[0] = strs[0].substring(0,strs[0].length()-1);
- strs[1] = end;
-
- } else if(!"".equals(end)&&DictionaryUtil.existEomi(end))
- {
- strs = new String[]{stem, end};
+ strs[1] = end;
+ return strs;
+ } else if(!"".equals(end)&&DictionaryUtil.existEomi(end)) {
+ return new String[]{stem, end};
+ } else {
+ return null;
}
-
- return strs;
}
/**
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java Sun Oct 27 22:35:26 2013
@@ -29,14 +29,10 @@ import org.apache.lucene.analysis.ko.dic
public class MorphAnalyzer {
- private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
+ private final CompoundNounAnalyzer cnAnalyzer;
- public MorphAnalyzer() {
- cnAnalyzer.setExactMach(false);
- }
-
- public void setExactCompound(boolean is) {
- cnAnalyzer.setExactMach(is);
+ public MorphAnalyzer(boolean exactMatch) {
+ cnAnalyzer = new CompoundNounAnalyzer(exactMatch);
}
/**
@@ -295,7 +291,7 @@ public class MorphAnalyzer {
output.setPos(PatternConstants.POS_ETC);
output.setPatn(PatternConstants.PTN_ADVJ);
}
- if(entry.isCompoundNoun()) output.addCNoun(entry.getCompounds());
+ if(entry.isCompoundNoun()) output.addCNouns(entry.getCompounds());
}else {
if(MorphUtil.hasVerbOnly(stem)) return;
}
@@ -320,7 +316,7 @@ public class MorphAnalyzer {
void analysisWithEomi(String stem, String end, List<AnalysisOutput> candidates) {
String[] morphs = EomiUtil.splitEomi(stem, end);
- if(morphs[0]==null) return; // 어미가 사전에 등록되어 있지 않다면....
+ if(morphs==null) return; // 어미가 사전에 등록되어 있지 않다면....
String[] pomis = EomiUtil.splitPomi(morphs[0]);
@@ -382,8 +378,8 @@ public class MorphAnalyzer {
boolean success = false;
- if(results.size()>1) {
- o.setCNoun(results);
+ if(results != null && results.size()>1) {
+ o.setCNounList(results);
success = true;
int maxWordLen = 0;
int dicWordLen = 0;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java Sun Oct 27 22:35:26 2013
@@ -56,7 +56,7 @@ class NounUtil {
}
String[] eomis = EomiUtil.splitEomi(start, end);
- if(eomis[0]==null) return false;
+ if(eomis==null) return false;
String[] pomis = EomiUtil.splitPomi(eomis[0]);
o.setStem(pomis[0]);
o.addElist(eomis[1]);
@@ -120,7 +120,7 @@ class NounUtil {
stomis = EomiUtil.splitEomi(eogan.substring(0,eogan.length()-1),eogan.substring(eogan.length()-1));
else
stomis = EomiUtil.splitEomi(eogan,"");
- if(stomis[0]==null) return false;
+ if(stomis==null) return false;
String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]);
if(irrs!=null) {
@@ -242,9 +242,9 @@ class NounUtil {
WordEntry cnoun = DictionaryUtil.getAllNoun(stem);
if(cnoun != null) {
if(cnoun.isCompoundNoun())
- output.setCNoun(cnoun.getCompounds());
+ output.setCNounList(cnoun.getCompounds());
else
- output.setCNoun(new ArrayList<CompoundEntry>()); // TODO: dont make all these lists
+ output.setCNounList(new ArrayList<CompoundEntry>()); // TODO: dont make all these lists
output.setScore(AnalysisOutput.SCORE_CORRECT);
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java Sun Oct 27 22:35:26 2013
@@ -101,7 +101,7 @@ class VerbUtil {
WordEntry entry = null;
if(success&&(entry=DictionaryUtil.getAllNoun(o.getStem()))!=null) {
if(entry.isCompoundNoun()) {
- o.setCNoun(entry.getCompounds());
+ o.setCNounList(entry.getCompounds());
}
o.setScore(AnalysisOutput.SCORE_CORRECT);
}
@@ -168,7 +168,7 @@ class VerbUtil {
else
stomis = EomiUtil.splitEomi(eogan,"");
- if(stomis[0]==null) return false;
+ if(stomis==null) return false;
o.addElist(stomis[1]);
int idxVbSfix = VerbUtil.endsWithVerbSuffix(stomis[0]);
@@ -190,7 +190,7 @@ class VerbUtil {
o.setScore(AnalysisOutput.SCORE_CORRECT);
if(entry.isCompoundNoun()) {
- o.setCNoun(entry.getCompounds());
+ o.setCNounList(entry.getCompounds());
}
candidates.add(o);
@@ -215,7 +215,7 @@ class VerbUtil {
success = true;
} else if(chrs.length>2&&chrs[2]=='ㅆ'){
String[] eres = EomiUtil.splitEomi(o.getStem().substring(0,strlen-1), "");
- if(eres[0]==null) return false;
+ if(eres==null) return false;
o.addElist(eres[1]);
String[] irrs = IrregularUtil.restoreIrregularVerb(eres[0], eres[1]);
@@ -259,10 +259,10 @@ class VerbUtil {
String[] stomis = null;
if(eogan.endsWith("ì")||eogan.endsWith("ì´")) {
stomis = EomiUtil.splitEomi(eogan.substring(0,eogan.length()-1),eogan.substring(eogan.length()-1));
- if(stomis[0]==null) return false;
+ if(stomis==null) return false;
}else {
stomis = EomiUtil.splitEomi(eogan, "");
- if(stomis[0]==null||!(stomis[1].startsWith("ì")||stomis[1].startsWith("ì´"))) return false;
+ if(stomis==null||!(stomis[1].startsWith("ì")||stomis[1].startsWith("ì´"))) return false;
}
String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java Sun Oct 27 22:35:26 2013
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ko.mo
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -31,12 +30,7 @@ import org.apache.lucene.analysis.ko.dic
public class WordSpaceAnalyzer {
- private MorphAnalyzer morphAnal;
-
- public WordSpaceAnalyzer() {
- morphAnal = new MorphAnalyzer();
- morphAnal.setExactCompound(false);
- }
+ private final MorphAnalyzer morphAnal = new MorphAnalyzer(false);
public List<AnalysisOutput> analyze(String input) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestCompoundSegment.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestCompoundSegment.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestCompoundSegment.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestCompoundSegment.java Sun Oct 27 22:35:26 2013
@@ -34,7 +34,7 @@ public class TestCompoundSegment extends
*/
public void testSegmentCompound() throws Exception {
- CompoundNounAnalyzer analyzer = new CompoundNounAnalyzer();
+ CompoundNounAnalyzer analyzer = new CompoundNounAnalyzer(false);
assertArrayEquals(splitByUnitWord(analyzer, "ì°êµ¬ê°ë°ê³¼ì ")
,new String[]{"ì°êµ¬","ê°ë°","ê³¼ì "});
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestMorphologicalAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestMorphologicalAnalyzer.java?rev=1536214&r1=1536213&r2=1536214&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestMorphologicalAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestMorphologicalAnalyzer.java Sun Oct 27 22:35:26 2013
@@ -35,7 +35,7 @@ public class TestMorphologicalAnalyzer e
*/
public void testMorphAnalyze() throws Exception {
- MorphAnalyzer morphAnalyzer = new MorphAnalyzer();
+ MorphAnalyzer morphAnalyzer = new MorphAnalyzer(false);
assertEquals(extractStem(morphAnalyzer, "ëë°©ì"), "ëë°©");
@@ -85,7 +85,7 @@ public class TestMorphologicalAnalyzer e
*/
public void testCompoundNoun() throws Exception {
- MorphAnalyzer morphAnalyzer = new MorphAnalyzer();
+ MorphAnalyzer morphAnalyzer = new MorphAnalyzer(false);
assertArrayEquals(splitByUnitWord(morphAnalyzer, "ê³¼í기ì ì°êµ¬ê³¼ì ê°"),
new String[]{"ê³¼í","기ì ","ì°êµ¬","ê³¼ì "});