You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2013/05/05 05:40:06 UTC
svn commit: r1479234 [3/15] - in /lucene/dev/branches/lucene4956:
dev-tools/idea/.idea/ dev-tools/idea/lucene/analysis/arirang/
lucene/analysis/ lucene/analysis/arirang/ lucene/analysis/arirang/src/
lucene/analysis/arirang/src/java/ lucene/analysis/ari...
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java Sun May 5 03:39:51 2013
@@ -0,0 +1,95 @@
+package org.apache.lucene.analysis.kr.morph;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class WordEntry {
+
+  // Indexes into the feature array; pass one of these to getFeature(int).
+  public static final int IDX_NOUN = 0;
+  public static final int IDX_VERB = 1;
+  public static final int IDX_BUSA = 2;
+  public static final int IDX_DOV = 3;
+  public static final int IDX_BEV = 4;
+  public static final int IDX_NE = 5;
+  public static final int IDX_ADJ = 6; // adjective
+  public static final int IDX_NPR = 7; // noun classification (M:Measure)
+  public static final int IDX_CNOUNX = 8;
+  public static final int IDX_REGURA = 9;
+
+  /**
+   * The word itself.
+   */
+  private String word;
+
+  /**
+   * Feature flags of the word, one char per IDX_* position.
+   */
+  private char[] features;
+
+  /**
+   * Compound parts of the word; starts as an empty list.
+   */
+  private List<CompoundEntry> compounds = new ArrayList<CompoundEntry>();
+
+  /** Creates an entry with no word and no features. */
+  public WordEntry() {
+  }
+
+  /** Creates an entry for the given word without features. */
+  public WordEntry(String word) {
+    this.word = word;
+  }
+
+  /** Creates an entry for the given word with the given feature array (stored by reference). */
+  public WordEntry(String word, char[] cs) {
+    this.word = word;
+    this.features = cs;
+  }
+
+  /**
+   * Creates an entry for the given word with the given compound list (stored by reference).
+   * The parameter stays a raw List so existing callers keep compiling.
+   */
+  @SuppressWarnings("unchecked")
+  public WordEntry(String word, List c) {
+    this.word = word;
+    this.compounds = c;
+  }
+
+  public void setWord(String w) {
+    this.word = w;
+  }
+
+  public String getWord() {
+    return this.word;
+  }
+
+  public void setFeatures(char[] cs) {
+    this.features = cs;
+  }
+
+  /**
+   * Returns the feature flag stored at the given position.
+   *
+   * @param index one of the IDX_* constants
+   * @return the flag at that position, or '0' when no feature array is set
+   *         or the index lies outside the array
+   */
+  public char getFeature(int index) {
+    // index == features.length is already out of range, so compare with >=.
+    if (features == null || index < 0 || index >= features.length) {
+      return '0';
+    }
+    return features[index];
+  }
+
+  public char[] getFeatures() {
+    return this.features;
+  }
+
+  public void setCompounds(List<CompoundEntry> c) {
+    this.compounds = c;
+  }
+
+  public List<CompoundEntry> getCompounds() {
+    return this.compounds;
+  }
+
+}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java Sun May 5 03:39:51 2013
@@ -0,0 +1,589 @@
+package org.apache.lucene.analysis.kr.morph;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kr.utils.DictionaryUtil;
+import org.apache.lucene.analysis.kr.utils.MorphUtil;
+import org.apache.lucene.analysis.kr.utils.SyllableUtil;
+import org.apache.lucene.analysis.kr.utils.VerbUtil;
+
+public class WordSpaceAnalyzer {
+
+ private MorphAnalyzer morphAnal;
+
+ /**
+  * Creates the analyzer together with its own MorphAnalyzer, configured
+  * with exact-compound matching switched off.
+  */
+ public WordSpaceAnalyzer() {
+   MorphAnalyzer analyzer = new MorphAnalyzer();
+   analyzer.setExactCompound(false);
+   this.morphAnal = analyzer;
+ }
+
+ /**
+ * Splits an input string into a list of analyzed phrases.
+ * The input is scanned one character at a time; at each position a list of
+ * candidate analyses is built, sorted, and handed to validationAndAppend(),
+ * which either accepts the best candidate or asks for a retry.
+ * Any text left over after the scan is appended as a single noun phrase.
+ *
+ * @param input the text to split
+ * @return the phrases collected in the WSOutput
+ * @throws MorphException propagated from the dictionary and analyzer helpers
+ */
+ public List<AnalysisOutput> analyze(String input) throws MorphException {
+
+ // NOTE(review): 'stack' is created but never read in this method.
+ List stack = new ArrayList();
+
+ WSOutput output = new WSOutput();
+
+ // start index of the word currently being assembled
+ int wStart = 0;
+
+ // NOTE(review): 'sgCount' is assigned below but never read in this method.
+ int sgCount = -9;
+
+ // maps a phrase end position to the index used for the last retry from that position
+ Map<Integer, Integer> fCounter = new HashMap();
+
+ for(int i=0;i<input.length();i++) {
+
+ char[] f = SyllableUtil.getFeature(input.charAt(i));
+
+ // look ahead one character; on the last character the fixed prefix "X" is looked up instead
+ String prefix = i==input.length()-1 ? "X" : input.substring(wStart,i+2);
+ Iterator iter = DictionaryUtil.findWithPrefix(prefix);
+
+ List<AnalysisOutput> candidates = new ArrayList();
+
+ WordEntry entry = null;
+
+ // NOTE(review): the three character literals below are mis-encoded in this archived copy.
+ if(input.charAt(i)=='ì' || input.charAt(i)=='ì' || input.charAt(i)=='ì') {
+ addSingleWord(input.substring(wStart,i), candidates);
+
+
+ // (translated from the original comment) If the text up to the next syllable still
+ // prefixes a dictionary word and this is not the last syllable, this is most likely
+ // not a spacing position. Single words such as adverbs or interjections could be
+ // split here, but that case is left to the check on the following syllable.
+ } else if(i!= input.length()-1 && iter.hasNext()) {
+ // nothing to do here.
+ sgCount = i;
+ } else if(!iter.hasNext() &&
+ (entry=DictionaryUtil.getBusa(input.substring(wStart,i+1)))!=null) {
+ candidates.add(buildSingleOutput(entry));
+
+ // this syllable is flagged as one that may start a josa or an eomi
+ } else if(f[SyllableUtil.IDX_EOGAN]=='1'||f[SyllableUtil.IDX_JOSA1]=='1'){
+ if(f[SyllableUtil.IDX_JOSA1]=='1')
+ candidates.addAll(anlysisWithJosa(input.substring(wStart), i-wStart));
+
+ if(f[SyllableUtil.IDX_EOGAN]=='1')
+ candidates.addAll(anlysisWithEomi(input.substring(wStart), i-wStart));
+ }
+
+ // sort the candidates with WSOuputComparator
+ Collections.sort(candidates, new WSOuputComparator());
+
+ // possibly add the source of the first candidate as a single-word candidate
+ appendSingleWord(candidates);
+
+ // try compound-noun confirmation on candidates that are not yet scored as correct
+ analysisCompouns(candidates);
+
+ // sort again because the two steps above may have added or re-scored candidates
+ Collections.sort(candidates, new WSOuputComparator());
+
+ int reseult = validationAndAppend(output, candidates, input);
+ if(reseult==1) {
+ // a phrase was appended: continue scanning right after it
+ i = output.getLastEnd()-1;
+ wStart = output.getLastEnd();
+ } else if(reseult==-1) {
+ // the candidate was rejected: restart from the last accepted end; each further
+ // retry from the same end position moves the restart index one step forward
+ Integer index = fCounter.get(output.getLastEnd());
+ if(index==null) index = output.getLastEnd();
+ else index = index + 1;
+ i = index;
+ wStart = output.getLastEnd();
+ fCounter.put(output.getLastEnd(), index);
+ }
+
+ }
+
+ // text remains after the last accepted phrase: return it as one noun phrase
+ if(output.getLastEnd()<input.length()) {
+
+ String source = input.substring(output.getLastEnd());
+ int score = DictionaryUtil.getWord(source)==null ? AnalysisOutput.SCORE_ANALYSIS : AnalysisOutput.SCORE_CORRECT;
+ AnalysisOutput o =new AnalysisOutput(source,null,null,PatternConstants.POS_NOUN,
+ PatternConstants.PTN_N,score);
+
+ o.setSource(source);
+ output.getPhrases().add(o);
+ morphAnal.confirmCNoun(o);
+
+ }
+
+ return output.getPhrases();
+ }
+
+ /**
+  * Analyzes a phrase that ends with a josa.
+  *
+  * @param snipt text beginning at the start of the current word
+  * @param js index inside snipt of the syllable at which the josa may begin
+  * @return the candidates found; empty when js is below 1 or findJosaEnd reports no josa
+  * @throws MorphException propagated from the dictionary and analyzer helpers
+  */
+ private List<AnalysisOutput> anlysisWithJosa(String snipt, int js) throws MorphException {
+   List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
+   if (js < 1) {
+     return results;
+   }
+
+   int josaEnd = findJosaEnd(snipt, js);
+   if (josaEnd == -1) {
+     return results; // no valid josa at this position
+   }
+
+   String phrase = snipt.substring(0, josaEnd);
+
+   // Move the stem/josa split point from right to left. The walk ends at the
+   // first syllable whose IDX_JOSA2 flag is '0'.
+   for (int split = phrase.length() - 1; split > 0; split--) {
+     String stemPart = phrase.substring(0, split);
+     String josaPart = phrase.substring(split);
+     char[] flags = SyllableUtil.getFeature(josaPart.charAt(0));
+
+     if (flags[SyllableUtil.IDX_JOSA1] == '1') {
+       morphAnal.analysisWithJosa(stemPart, josaPart, results);
+     }
+     if (flags[SyllableUtil.IDX_JOSA2] == '0') {
+       break;
+     }
+   }
+
+   // A one-syllable phrase is additionally offered as a plain noun.
+   if (phrase.length() == 1) {
+     results.add(new AnalysisOutput(phrase, null, null, PatternConstants.POS_NOUN,
+         PatternConstants.PTN_N, AnalysisOutput.SCORE_ANALYSIS));
+   }
+
+   fillSourceString(phrase, results);
+   return results;
+ }
+
+ /**
+ * Starting at the first syllable of a possible josa, follows the syllables that
+ * are flagged as usable in the second or later position of a josa and looks for
+ * the longest josa known to the dictionary.
+ * @param snipt text beginning at the start of the current word
+ * @param jstart index inside snipt where the josa may begin (the code reads jstart-1, so it must be at least 1)
+ * @return the index one past the end of the josa, or -1 when no josa is accepted
+ * @throws MorphException propagated from the dictionary helpers
+ */
+ private int findJosaEnd(String snipt, int jstart) throws MorphException {
+
+ int jend = jstart;
+
+ // (translated from the original comment) this two-syllable sequence never forms a noun.
+ // NOTE(review): the character literals below are mis-encoded in this archived copy.
+ if(snipt.charAt(jstart-1)=='ê²'&&(snipt.charAt(jstart)=='ì')) return jstart+1;
+
+ if(snipt.length()>jstart+2&&snipt.charAt(jstart+1)=='ì¤') { // special case for a particular suffix; the original example words are mis-encoded
+ char[] chrs = MorphUtil.decompose(snipt.charAt(jstart+2));
+
+ if(chrs.length>=2&&chrs[0]=='ã¹'&&chrs[1]=='ã
') return -1;
+ }
+
+ // extend jend over the following syllables whose IDX_JOSA2 flag is not '0'
+ for(int i=jstart+1;i<snipt.length();i++) {
+ char[] f = SyllableUtil.getFeature(snipt.charAt(i));
+ if(f[SyllableUtil.IDX_JOSA2]=='0') break;
+ jend = i;
+ }
+
+ // shrink from the longest span until the dictionary knows the josa
+ int start = jend;
+ boolean hasJosa = false;
+ for(int i=start;i>=jstart;i--) {
+ String str = snipt.substring(jstart,i+1);
+ if(DictionaryUtil.existJosa(str) && !findNounWithinStr(snipt,i,i+2) &&
+ !isNounPart(snipt,jstart)) {
+ jend = i;
+ hasJosa = true;
+ break;
+ }
+ }
+
+ if(!hasJosa) return -1;
+
+ return jend+1;
+
+ }
+
+ /**
+  * Stores the source string on every candidate so that it is available for
+  * later calculation or display.
+  *
+  * @param source the original text the candidates were derived from
+  * @param candidates the candidates to update in place
+  */
+ private void fillSourceString(String source, List<AnalysisOutput> candidates) {
+   Iterator<AnalysisOutput> it = candidates.iterator();
+   while (it.hasNext()) {
+     it.next().setSource(source);
+   }
+ }
+
+ /**
+  * Offers the source text of the first candidate as an additional single-word
+  * candidate. The caller sorts the list beforehand, so the first entry is
+  * expected to be the longest one.
+  *
+  * @param candidates the sorted candidate list; may be extended by one entry
+  * @throws MorphException propagated from the dictionary and analyzer helpers
+  */
+ private void appendSingleWord(List<AnalysisOutput> candidates) throws MorphException {
+   if (candidates.size() == 0) {
+     return;
+   }
+
+   AnalysisOutput first = candidates.get(0);
+   String source = first.getSource();
+
+   WordEntry entry = DictionaryUtil.getWordExceptVerb(source);
+   if (entry != null) {
+     candidates.add(buildSingleOutput(entry));
+     return;
+   }
+
+   // Skip candidates whose pattern lies in (PTN_VM, PTN_VMXMJ] and sources shorter than 5.
+   boolean inVerbPatternRange = first.getPatn() > PatternConstants.PTN_VM
+       && first.getPatn() <= PatternConstants.PTN_VMXMJ;
+   if (inVerbPatternRange || source.length() < 5) {
+     return;
+   }
+
+   AnalysisOutput nounGuess = new AnalysisOutput(source, null, null, PatternConstants.POS_NOUN,
+       PatternConstants.PTN_N, AnalysisOutput.SCORE_ANALYSIS);
+   nounGuess.setSource(source);
+   morphAnal.confirmCNoun(nounGuess);
+
+   // Only keep the guess when compound-noun confirmation scored it as correct.
+   if (nounGuess.getScore() == AnalysisOutput.SCORE_CORRECT) {
+     candidates.add(nounGuess);
+   }
+ }
+
+ /**
+  * Adds the given text to the candidates as a single word: as a dictionary
+  * entry when one is found (verbs excluded), otherwise as a noun guess that has
+  * been passed through compound-noun confirmation.
+  *
+  * @param source the text to add
+  * @param candidates the list that receives the new candidate
+  * @throws MorphException propagated from the dictionary and analyzer helpers
+  */
+ private void addSingleWord(String source, List<AnalysisOutput> candidates) throws MorphException {
+   WordEntry entry = DictionaryUtil.getWordExceptVerb(source);
+   if (entry != null) {
+     candidates.add(buildSingleOutput(entry));
+     return;
+   }
+
+   AnalysisOutput nounGuess = new AnalysisOutput(source, null, null, PatternConstants.POS_NOUN,
+       PatternConstants.PTN_N, AnalysisOutput.SCORE_ANALYSIS);
+   nounGuess.setSource(source);
+   morphAnal.confirmCNoun(nounGuess);
+   candidates.add(nounGuess);
+ }
+
+ /**
+ * Analyzes a phrase that ends with an eomi (verbal ending).
+ * A noun part in front of the predicate, if any, is split off and later
+ * re-attached to each candidate as compound entries.
+ *
+ * @param snipt text beginning at the start of the current word
+ * @param estart index inside snipt of the syllable at which the eomi may begin
+ * @return the candidates found (possibly empty)
+ * @throws MorphException propagated from the dictionary and analyzer helpers
+ */
+ private List anlysisWithEomi(String snipt, int estart) throws MorphException {
+
+ List<AnalysisOutput> candidates = new ArrayList();
+
+ int eend = findEomiEnd(snipt,estart);
+
+ // separate a noun in front of the verb: move vstart left while the text up to estart still prefixes a dictionary word
+ int vstart = 0;
+ for(int i=estart-1;i>=0;i--) {
+ Iterator iter = DictionaryUtil.findWithPrefix(snipt.substring(i,estart));
+ if(iter.hasNext()) vstart=i;
+ else break;
+ }
+
+ if(snipt.length()>eend &&
+ DictionaryUtil.findWithPrefix(snipt.substring(vstart,eend+1)).hasNext())
+ return candidates; // the text including the next syllable still prefixes a word: do not split here
+
+ // text in front of the predicate, or null when the predicate starts at index 0
+ String pvword = null;
+ if(vstart!=0) pvword = snipt.substring(0,vstart);
+
+ while(true) { // the end was pushed back because of one of the three endings tested below; if the result is not predicate + eomi, shorten the phrase end by one
+ String input = snipt.substring(vstart,eend);
+ anlysisWithEomiDetail(input, candidates);
+ if(candidates.size()==0) break;
+ // NOTE(review): the string literals below are mis-encoded in this archived copy.
+ if(("ã¹".equals(candidates.get(0).getEomi()) ||
+ "ã
".equals(candidates.get(0).getEomi()) ||
+ "ã´".equals(candidates.get(0).getEomi())) &&
+ eend>estart+1 && candidates.get(0).getPatn()!=PatternConstants.PTN_VM &&
+ candidates.get(0).getPatn()!=PatternConstants.PTN_NSM
+ ) {
+ eend--;
+ }else if(pvword!=null&&candidates.get(0).getPatn()>=PatternConstants.PTN_VM&& // noun + predicate phrase: drop the result when the predicate phrase is itself a dictionary word
+ candidates.get(0).getPatn()<=PatternConstants.PTN_VMXMJ && DictionaryUtil.getWord(input)!=null){
+ candidates.clear();
+ break;
+ }else if(pvword!=null&&VerbUtil.verbSuffix(candidates.get(0).getStem())
+ &&DictionaryUtil.getNoun(pvword)!=null){ // noun + verbalizing suffix + eomi: re-analyze from the start of snipt
+ candidates.clear();
+ anlysisWithEomiDetail(snipt.substring(0,eend), candidates);
+ pvword=null;
+ break;
+ } else {
+ break;
+ }
+ }
+
+ if(candidates.size()>0&&pvword!=null) {
+ AnalysisOutput o =new AnalysisOutput(pvword,null,null,PatternConstants.POS_NOUN,
+ PatternConstants.PTN_N,AnalysisOutput.SCORE_ANALYSIS);
+ morphAnal.confirmCNoun(o);
+
+ // when confirmation produced no compound parts, use the whole prefix as a single part
+ List<CompoundEntry> cnouns = o.getCNounList();
+ if(cnouns.size()==0) {
+ boolean is = DictionaryUtil.getWordExceptVerb(pvword)!=null;
+ cnouns.add(new CompoundEntry(pvword,0,is));
+ }
+
+ for(AnalysisOutput candidate : candidates) {
+ candidate.getCNounList().addAll(cnouns);
+ candidate.getCNounList().add(new CompoundEntry(candidate.getStem(),0,true));
+ candidate.setStem(pvword+candidate.getStem()); // (translated) needed so that WSOutput handles the compound noun correctly
+ }
+
+ }
+
+ fillSourceString(snipt.substring(0,eend), candidates);
+
+ return candidates;
+ }
+
+ /**
+  * Tries the possible stem/eomi splits of the input and lets the morphological
+  * analyzer add whatever analyses it finds to the candidate list.
+  *
+  * @param input the phrase to split; must not be empty
+  * @param candidates the list that receives the analyses
+  * @throws MorphException propagated from the analyzer helpers
+  */
+ private void anlysisWithEomiDetail(String input, List<AnalysisOutput> candidates )
+     throws MorphException {
+   int length = input.length();
+
+   // When the last syllable carries any of the three YNP* flags, also try the
+   // whole input with an empty eomi.
+   char[] lastFlags = SyllableUtil.getFeature(input.charAt(length - 1));
+   boolean lastSyllableFlagged = lastFlags[SyllableUtil.IDX_YNPNA] == '1'
+       || lastFlags[SyllableUtil.IDX_YNPLA] == '1'
+       || lastFlags[SyllableUtil.IDX_YNPMA] == '1';
+   if (lastSyllableFlagged) {
+     morphAnal.analysisWithEomi(input, "", candidates);
+   }
+
+   // Move the split point from right to left. The walk ends after the first
+   // eomi whose leading syllable has its IDX_EOMI2 flag set to '0'.
+   for (int split = length - 1; split > 0; split--) {
+     String stemPart = input.substring(0, split);
+     String eomiPart = input.substring(split);
+     char[] flags = SyllableUtil.getFeature(eomiPart.charAt(0));
+
+     morphAnal.analysisWithEomi(stemPart, eomiPart, candidates);
+
+     if (flags[SyllableUtil.IDX_EOMI2] == '0') {
+       break;
+     }
+   }
+ }
+
+ /**
+ * Starting at the first syllable of a possible eomi, follows the syllables
+ * flagged with IDX_EOGAN and looks for the longest eomi known to the dictionary.
+ * @param snipt text beginning at the start of the current word
+ * @param estart index inside snipt where the eomi may begin
+ * @return the index one past the end of the eomi; at least estart+1
+ * @throws MorphException propagated from the dictionary helpers
+ */
+ private int findEomiEnd(String snipt, int estart) throws MorphException {
+
+ int jend = 0;
+
+ // When the syllable at estart decomposes into three parts and the third is one of the
+ // jamo tested below, the first character of 'tail' is replaced by a fixed syllable.
+ // NOTE(review): the character literals in this method are mis-encoded in this archived copy.
+ String tail = null;
+ char[] chr = MorphUtil.decompose(snipt.charAt(estart));
+ if(chr.length==3 && (chr[2]=='ã´')) {
+ tail = 'ì'+snipt.substring(estart+1);
+ }else if(chr.length==3 && (chr[2]=='ã¹')) {
+ tail = 'ì'+snipt.substring(estart+1);
+ }else if(chr.length==3 && (chr[2]=='ã
')) {
+ tail = 'ìµ'+snipt.substring(estart+1);
+ }else {
+ tail = snipt.substring(estart);
+ }
+
+ // find the last following syllable whose IDX_EOGAN flag is not '0'
+ int start = 0;
+ for(int i=1;i<tail.length();i++) {
+ char[] f = SyllableUtil.getFeature(tail.charAt(i));
+ if(f[SyllableUtil.IDX_EOGAN]=='0') break;
+ start = i;
+ }
+
+ for(int i=start;i>0;i--) { // even when nothing matches, one syllable is always returned (jend stays 0)
+ String str = tail.substring(0,i+1);
+ char[] chrs = MorphUtil.decompose(tail.charAt(i));
+ if(DictionaryUtil.existEomi(str) ||
+ (i<2&&chrs.length==3&&(chrs[2]=='ã¹'||chrs[2]=='ã
'||chrs[2]=='ã´'))) { // (translated) no predicate has these final consonants in a row; to be verified against the dictionary
+ jend = i;
+ break;
+ }
+ }
+
+ return estart+jend+1;
+
+ }
+
+ /**
+ * Validates the best candidate and, when it passes, appends it to the output.
+ *
+ * @param output the phrases accepted so far; may get a phrase appended or its last phrase removed
+ * @param candidates the sorted candidates; the leading one or two entries are removed from this list
+ * @param input the complete text being analyzed
+ * @return 0 when there are no candidates, 1 when a phrase was appended,
+ *         -1 when the candidate was rejected
+ * @throws MorphException propagated from the dictionary helpers
+ */
+ private int validationAndAppend(WSOutput output, List<AnalysisOutput> candidates, String input)
+ throws MorphException {
+
+ if(candidates.size()==0) return 0;
+
+ // o: best candidate; po: most recently accepted phrase, or null when there is none
+ AnalysisOutput o = candidates.remove(0);
+ AnalysisOutput po = output.getPhrases().size()>0 ? output.getPhrases().get(output.getPhrases().size()-1) : null;
+
+ // the part of the candidate's source that follows its stem
+ String ejend = o.getSource().substring(o.getStem().length());
+
+ // NOTE(review): 'chrs' and 'pjend' are computed but never read in this method.
+ char[] chrs = po!=null&&po.getStem().length()>0 ? MorphUtil.decompose(po.getStem().charAt(po.getStem().length()-1)) : null;
+ String pjend = po!=null&&po.getStem().length()>0 ? po.getSource().substring(po.getStem().length()) : null;
+
+ char ja = 'x'; // arbitrary placeholder character
+ if(po!=null&&(po.getPatn()==PatternConstants.PTN_VM||po.getPatn()==PatternConstants.PTN_VMCM||po.getPatn()==PatternConstants.PTN_VMXM)) {
+ // take the last decomposed part of the final character of the previous phrase's eomi
+ char[] chs = MorphUtil.decompose(po.getEomi().charAt(po.getEomi().length()-1));
+ if(chs.length==3) ja=chs[2];
+ else if(chs.length==1) ja=chs[0];
+ }
+
+ // end position in 'input' if this candidate were accepted
+ int nEnd = output.getLastEnd()+o.getSource().length();
+
+ // features of the character right after the candidate; null at the end of the input
+ char[] f = nEnd<input.length() ? SyllableUtil.getFeature(input.charAt(nEnd)) : null;
+
+ // (translated) noun followed by a verb form is possible, but the verb form itself is not a noun:
+ // prefer the next candidate when it is a plain noun
+ if(po!=null&&po.getPatn()==PatternConstants.PTN_N&&candidates.size()>0&&
+ o.getPatn()==PatternConstants.PTN_VM&&candidates.get(0).getPatn()==PatternConstants.PTN_N) {
+ o = candidates.remove(0);
+ }else if(po!=null&&po.getPatn()>=PatternConstants.PTN_VM&&candidates.size()>0&&
+ candidates.get(0).getPatn()==PatternConstants.PTN_N&&
+ (ja=='ã´'||ja=='ã¹')) { // (translated) prevents an unwanted decomposition after these endings; literals are mis-encoded in this copy
+ o = candidates.remove(0);
+ }
+
+ //=============================================
+ if(o.getPos()==PatternConstants.POS_NOUN && MorphUtil.hasVerbOnly(o.getStem())) {
+ output.removeLast();
+ return -1;
+ }else if(nEnd<input.length() && f[SyllableUtil.IDX_JOSA1]=='1'
+ && DictionaryUtil.getNoun(o.getSource())!=null) {
+ return -1;
+ }else if(nEnd<input.length() && o.getScore()==AnalysisOutput.SCORE_ANALYSIS
+ && DictionaryUtil.findWithPrefix(ejend+input.charAt(nEnd)).hasNext()) { // the ending plus the next character prefixes a dictionary word: reject
+ return -1;
+ }else if(po!=null&&po.getPatn()==PatternConstants.PTN_VM&&"ã
".equals(po.getEomi())&&
+ o.getStem().equals("í")) { // (translated) prevents a split between the previous phrase and this stem; literals are mis-encoded in this copy
+ output.removeLast();
+ return -1;
+ }else if(po!=null&&po.getPatn()==PatternConstants.PTN_N&&VerbUtil.verbSuffix(o.getStem())&&
+ !"ì".equals(o.getStem())) { // (translated) noun + verbal suffix belongs together, except for the one stem excluded here
+ output.removeLast();
+ return -1;
+ } else {
+ output.addPhrase(o);
+ }
+
+ return 1;
+ }
+
+
+ private AnalysisOutput buildSingleOutput(WordEntry entry) {
+
+ char pos = PatternConstants.POS_NOUN;
+
+ int ptn = PatternConstants.PTN_N;
+
+ if(entry.getFeature(WordEntry.IDX_NOUN)=='0') {
+ pos = PatternConstants.POS_AID;
+ ptn = PatternConstants.PTN_AID;
+ }
+
+ AnalysisOutput o = new AnalysisOutput(entry.getWord(),null,null,pos,
+ ptn,AnalysisOutput.SCORE_CORRECT);
+
+ o.setSource(entry.getWord());
+
+ return o;
+ }
+
+ /**
+ * Runs compound-noun confirmation on the candidates that are not yet scored as correct.
+ *
+ * @param candidates the candidates to inspect; confirmCNoun may update them in place
+ * @throws MorphException propagated from the analyzer helpers
+ */
+ private void analysisCompouns(List<AnalysisOutput> candidates) throws MorphException {
+
+ // decide per candidate whether to attempt compound-noun decomposition
+ // NOTE(review): 'changed' is written below but never read in this method.
+ boolean changed = false;
+ // becomes true once a correct candidate with a pattern other than PTN_NJ has been seen
+ boolean correct = false;
+ for(AnalysisOutput o:candidates) {
+
+ if(o.getScore()==AnalysisOutput.SCORE_CORRECT) {
+ if(o.getPatn()!=PatternConstants.PTN_NJ) correct=true;
+ // (translated) when a correct analysis with this verbal suffix exists, stop here so that
+ // the remaining candidates are not decomposed; the literal is mis-encoded in this copy
+ if("í".equals(o.getVsfx())) break;
+ continue;
+ }
+
+ if(o.getPatn()<=PatternConstants.PTN_VM&&o.getStem().length()>2) {
+ // skip plain-noun candidates once a correct analysis has been found
+ if(!(correct&&o.getPatn()==PatternConstants.PTN_N)) morphAnal.confirmCNoun(o);
+ if(o.getScore()==AnalysisOutput.SCORE_CORRECT) changed=true;
+ }
+ }
+
+ }
+
+ /**
+  * Looks, from position es onwards, for the first syllable whose IDX_JOSA1
+  * flag is set and reports whether the text between ws and that syllable is a
+  * dictionary word.
+  *
+  * @param str the complete text being analyzed
+  * @param ws start position of the word to look up
+  * @param es position from which to look for a josa-start syllable
+  * @return true when such a syllable exists and the text before it is a dictionary
+  *         word; false otherwise, including when es lies beyond the end of str
+  * @throws MorphException propagated from the dictionary helpers
+  */
+ private boolean findNounWithinStr(String str, int ws, int es) throws MorphException {
+   if (str.length() < es) {
+     return false;
+   }
+
+   // The original condition also tested i==str.length(), which can never hold
+   // inside this loop; it has been dropped without changing the result.
+   for (int i = es; i < str.length(); i++) {
+     char[] f = SyllableUtil.getFeature(str.charAt(i));
+     if (f[SyllableUtil.IDX_JOSA1] == '1') {
+       return DictionaryUtil.getWord(str.substring(ws, i)) != null;
+     }
+   }
+
+   return false;
+ }
+
+ /**
+  * Always returns false: the unconditional return at the top short-circuits
+  * the method, so the dictionary scan below it never runs.
+  *
+  * @param str the text being analyzed
+  * @param jstart index of the syllable where a josa may begin
+  * @return false
+  * @throws MorphException declared for the dictionary lookup in the unreachable part
+  */
+ private boolean isNounPart(String str, int jstart) throws MorphException {
+   if (true) {
+     return false;
+   }
+
+   // Not reached: checks whether any span ending at jstart is a dictionary word (verbs excluded).
+   int from = jstart - 1;
+   while (from >= 0) {
+     if (DictionaryUtil.getWordExceptVerb(str.substring(from, jstart + 1)) != null) {
+       return true;
+     }
+     from--;
+   }
+
+   return false;
+ }
+
+ /**
+  * Debug helper: prints every phrase of the output with its score on one
+  * line of standard output, terminated by the marker "<==".
+  *
+  * @param output the output whose phrases are printed
+  */
+ private void printCandidate(WSOutput output) {
+   Iterator<AnalysisOutput> it = output.getPhrases().iterator();
+   while (it.hasNext()) {
+     AnalysisOutput phrase = it.next();
+     System.out.print(phrase.toString()+"("+phrase.getScore()+")| ");
+   }
+   System.out.println("<==");
+ }
+}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java Sun May 5 03:39:51 2013
@@ -0,0 +1,317 @@
+package org.apache.lucene.analysis.kr.tagging;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
+import org.apache.lucene.analysis.kr.morph.MorphException;
+import org.apache.lucene.analysis.kr.morph.PatternConstants;
+import org.apache.lucene.analysis.kr.utils.ConstraintUtil;
+import org.apache.lucene.analysis.kr.utils.FileUtil;
+import org.apache.lucene.analysis.kr.utils.KoreanEnv;
+import org.apache.lucene.analysis.kr.utils.StringUtil;
+import org.apache.lucene.analysis.kr.utils.Trie;
+
+
+/**
+ * Selects the best result among several morphological analysis results.
+ * This class is meant to be called sentence by sentence.
+ */
+public class Tagger {
+
+ // Rule trie; filled by loadTaggerDic() on first use and queried by getGR().
+ private static Trie<String, String[]> occurrences;
+
+ // Key passed to KoreanEnv.getValue() to locate the tagger dictionary.
+ private static final String tagDicLoc = "tagger.dic";
+
+ // Rule-field marker: checkGrammer() skips the word/ending check for a field equal to this value.
+ private static final String NILL = "NILL";
+
+ // Rule-field marker: checkGrammer() skips the pattern check for a field equal to this value.
+ private static final String NOPATN = "0";
+
+ // Result of the most recent tagging() call; lookupBestByPWord() uses it as the preceding word.
+ private AnalysisOutput po;
+
+ /**
+  * Selects the best analysis for a word when nothing is known about the
+  * following word; delegates to the four-argument overload.
+  *
+  * @param psource surface form of the word
+  * @param pmorphs candidate analyses of the word
+  * @return the selected analysis, or null when there are no candidates
+  * @throws MorphException when the tagger dictionary cannot be read
+  */
+ public AnalysisOutput tagging(String psource, List<AnalysisOutput> pmorphs) throws MorphException {
+   final String noRearSource = null;
+   final List<AnalysisOutput> noRearMorphs = null;
+   return tagging(psource, noRearSource, pmorphs, noRearMorphs);
+ }
+
+ /**
+  * Selects the best analysis for a word, optionally using the candidates of
+  * the following word, and remembers the selection for the next call.
+  *
+  * @param psource surface form of the word to tag
+  * @param rsource surface form of the following word; may be null
+  * @param pmorphs candidate analyses of the word to tag
+  * @param rmorphs candidate analyses of the following word; may be null or empty
+  * @return the selected analysis with its source set to psource, or null when
+  *         pmorphs is null or empty
+  * @throws MorphException when the tagger dictionary cannot be read
+  */
+ public AnalysisOutput tagging(String psource, String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs) throws MorphException {
+   // The selection is always made from pmorphs. Previously a null or empty
+   // pmorphs combined with a non-empty rmorphs passed the guard and then
+   // failed inside lookupBest with an NPE or IndexOutOfBoundsException.
+   if (pmorphs == null || pmorphs.isEmpty()) {
+     return null;
+   }
+
+   po = lookupBest(psource, rsource, pmorphs, rmorphs);
+   po.setSource(psource);
+
+   return po;
+ }
+
+ /**
+  * Picks one analysis out of pmorphs. A single candidate is returned as is.
+  * Otherwise the rules keyed on the following word are tried first, then,
+  * when a previous selection exists, the rules keyed on the preceding word.
+  * When no rule matches, the first candidate is returned.
+  *
+  * @param psource surface form of the word to tag
+  * @param rsource surface form of the following word; may be null
+  * @param pmorphs candidate analyses of the word to tag; must not be empty
+  * @param rmorphs candidate analyses of the following word; may be null or empty
+  * @return the selected analysis
+  * @throws MorphException when the tagger dictionary cannot be read
+  */
+ private AnalysisOutput lookupBest(String psource,String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs) throws MorphException {
+   if (pmorphs.size() == 1) {
+     return pmorphs.get(0);
+   }
+
+   AnalysisOutput chosen = null;
+
+   boolean hasRearCandidates = rmorphs != null && rmorphs.size() != 0;
+   if (hasRearCandidates) {
+     chosen = lookupBestByRWord(psource, rsource, pmorphs, rmorphs);
+   }
+
+   if (chosen == null && po != null) {
+     chosen = lookupBestByPWord(psource, pmorphs);
+   }
+
+   return chosen != null ? chosen : pmorphs.get(0);
+ }
+
+ /**
+  * Decides the current word on the basis of the preceding word, which is the
+  * previous selection held in the field po and must not be null.
+  *
+  * @param rsource surface form of the current word
+  * @param rmorphs candidate analyses of the current word; entries collected for
+  *        removal are deleted from this list as long as more than one remains
+  * @return the first candidate accepted by a rule, or null when none is accepted
+  * @throws MorphException when the tagger dictionary cannot be read
+  */
+ private AnalysisOutput lookupBestByPWord(String rsource, List<AnalysisOutput> rmorphs) throws MorphException {
+   List<AnalysisOutput> rejected = new ArrayList<AnalysisOutput>();
+
+   for (AnalysisOutput candidate : rmorphs) {
+     // First the rules keyed on the surface form, then those keyed on the stem.
+     AnalysisOutput hit = selectBest(getGR("F"+rsource+"^W"),
+         po.getSource(), rsource, po, candidate, true, rejected);
+     if (hit != null) {
+       return hit;
+     }
+
+     hit = selectBest(getGR("F"+candidate.getStem()+"^S"),
+         po.getSource(), rsource, po, candidate, true, rejected);
+     if (hit != null) {
+       return hit;
+     }
+   }
+
+   // Never remove the last remaining candidate.
+   for (AnalysisOutput loser : rejected) {
+     if (rmorphs.size() > 1) {
+       rmorphs.remove(loser);
+     }
+   }
+
+   return null;
+ }
+
+ /**
+  * Decides the current word on the basis of the following word, whose
+  * candidates must not be null.
+  *
+  * @param psource surface form of the current word
+  * @param rsource surface form of the following word
+  * @param pmorphs candidate analyses of the current word; entries collected for
+  *        removal are deleted from this list as long as more than one remains
+  * @param rmorphs candidate analyses of the following word; only the leading
+  *        entries scored as correct are considered
+  * @return the first candidate accepted by a rule, or null when none is accepted
+  * @throws MorphException when the tagger dictionary cannot be read
+  */
+ private AnalysisOutput lookupBestByRWord(String psource, String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs) throws MorphException {
+   List<AnalysisOutput> rejected = new ArrayList<AnalysisOutput>();
+
+   for (AnalysisOutput rear : rmorphs) {
+     if (rear.getScore() != AnalysisOutput.SCORE_CORRECT) {
+       break;
+     }
+
+     // Ending of the following word: computed exactly as before, although the
+     // value is not used further down in this method.
+     String rend = rear.getJosa();
+     if (rend == null) {
+       rend = rear.getEomi();
+     }
+
+     for (AnalysisOutput front : pmorphs) {
+       // 1. rules keyed on the surface form of the current word
+       Iterator<String[]> rules = getGR("R"+psource+"^W/");
+
+       String pend = front.getJosa();
+       if (pend == null) {
+         pend = front.getEomi();
+       }
+
+       AnalysisOutput hit = selectBest(rules, psource, rsource, front, rear, false, rejected);
+       if (hit != null) {
+         return hit;
+       }
+
+       // 2. rules keyed on the ending of the current word
+       rules = getGR("R"+NILL+"/"+pend+"/");
+       hit = selectBest(rules, psource, rsource, front, rear, false, rejected);
+       if (hit != null) {
+         return hit;
+       }
+
+       // 3. rules keyed on the stem of the current word
+       rules = getGR("R"+front.getStem()+"^S/");
+       hit = selectBest(rules, psource, rsource, front, rear, false, rejected);
+       if (hit != null) {
+         return hit;
+       }
+     }
+   }
+
+   // Never remove the last remaining candidate.
+   for (AnalysisOutput loser : rejected) {
+     if (pmorphs.size() > 1) {
+       pmorphs.remove(loser);
+     }
+   }
+
+   return null;
+ }
+
+ /**
+  * Walks the given rules and returns the candidate under selection as soon as
+  * one rule accepts the pair. When a rule rejects the pair and its seventh
+  * field is "1", the candidate is recorded for removal and the walk stops.
+  *
+  * @param iter the rules to try
+  * @param psource surface form of the front word
+  * @param rsource surface form of the rear word
+  * @param pmorph analysis of the front word
+  * @param rmorph analysis of the rear word
+  * @param rear true when the rear word is the one being selected (front word is
+  *        fixed), false when the front word is being selected
+  * @param removes collects candidates that a rule marked for removal
+  * @return the accepted candidate (rmorph when rear is true, otherwise pmorph),
+  *         or null when no rule accepts the pair
+  */
+ private AnalysisOutput selectBest(Iterator<String[]> iter, String psource, String rsource,
+     AnalysisOutput pmorph, AnalysisOutput rmorph, boolean rear, List<AnalysisOutput> removes) {
+
+   // The candidate under selection. In rear mode the original recorded pmorph
+   // (the fixed previous word) for removal, which the caller could never find
+   // in its rmorphs list.
+   AnalysisOutput candidate = rear ? rmorph : pmorph;
+
+   while (iter.hasNext()) {
+     String[] values = iter.next();
+
+     if (checkGrammer(values, psource, rsource, pmorph, rmorph, rear)) {
+       return candidate;
+     }
+
+     // Guard the index so that a short rule is skipped instead of throwing.
+     if (values.length > 6 && "1".equals(values[6])) {
+       if (!removes.contains(candidate)) {
+         removes.add(candidate);
+       }
+       break;
+     }
+   }
+
+   return null;
+ }
+
+ /**
+  * Checks one rule against a front/rear pair of analyses. A rule field equal
+  * to NILL (words, endings) or NOPATN (patterns) is not checked.
+  *
+  * @param values rule fields: [0] front word, [1] front ending, [2] front pattern,
+  *        [3] rear word, [4] rear ending, [5] rear pattern
+  * @param psource surface form of the front word
+  * @param rsource surface form of the rear word
+  * @param pmorph analysis of the front word
+  * @param rmorph analysis of the rear word
+  * @param depFront when true the front word field is checked and the rear word
+  *        field is skipped; when false it is the other way round
+  * @return true when every checked field matches
+  */
+ private boolean checkGrammer(String[] values, String psource, String rsource, AnalysisOutput pmorph, AnalysisOutput rmorph, boolean depFront) {
+   // The ending of a word is its josa or, failing that, its eomi.
+   String frontEnding = pmorph.getJosa();
+   if (frontEnding == null) {
+     frontEnding = pmorph.getEomi();
+   }
+
+   String rearEnding = rmorph.getJosa();
+   if (rearEnding == null) {
+     rearEnding = rmorph.getEomi();
+   }
+
+   return (!depFront || NILL.equals(values[0]) || checkWord(psource, values[0], pmorph))
+       && (NILL.equals(values[1]) || checkEomi(values[1], frontEnding))
+       && (NOPATN.equals(values[2]) || checkPattern(values[2], pmorph.getPatn()))
+       && (depFront || NILL.equals(values[3]) || checkWord(rsource, values[3], rmorph))
+       && (NILL.equals(values[4]) || checkEomi(values[4], rearEnding))
+       && (NOPATN.equals(values[5]) || checkPattern(values[5], rmorph.getPatn()));
+ }
+
+ /**
+  * Checks a word field of a rule. The field has the form "a,b,c^T": a
+  * comma-separated word list, a caret, and a type. With type "S" the words are
+  * compared with the stem of the analysis, otherwise with the surface form.
+  *
+  * @param source surface form of the word
+  * @param value the rule field
+  * @param morph analysis of the word
+  * @return true when one of the listed words equals the compared text
+  */
+ private boolean checkWord(String source, String value, AnalysisOutput morph) {
+   String[] types = StringUtil.split(value,"^");
+   String[] words = StringUtil.split(types[0],",");
+
+   String target = "S".equals(types[1]) ? morph.getStem() : source;
+
+   for (String word : words) {
+     if (word.equals(target)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Checks an ending field of a rule.
+  *
+  * @param value comma-separated list of endings accepted by the rule
+  * @param rend the ending of the analysis; may be null
+  * @return true when one of the listed endings equals rend
+  */
+ private boolean checkEomi(String value, String rend) {
+   for (String ending : StringUtil.split(value,",")) {
+     if (ending.equals(rend)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Checks a pattern field of a rule. Each comma-separated token is either
+  * "E" (any pattern accepted by ConstraintUtil.isEomiPhrase), "J" (any pattern
+  * accepted by ConstraintUtil.isJosaNounPhrase, or PTN_N), or a pattern number
+  * in decimal.
+  *
+  * @param value the rule field
+  * @param ptn the pattern of the analysis
+  * @return true when one of the tokens matches ptn
+  */
+ private boolean checkPattern(String value, int ptn) {
+   String[] tokens = StringUtil.split(value,",");
+   String ptnText = Integer.toString(ptn);
+
+   for (String token : tokens) {
+     boolean matched =
+         ("E".equals(token) && ConstraintUtil.isEomiPhrase(ptn))
+         || ("J".equals(token)
+             && (ConstraintUtil.isJosaNounPhrase(ptn) || ptn == PatternConstants.PTN_N))
+         || token.equals(ptnText);
+     if (matched) {
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ public static synchronized Iterator<String[]> getGR(String prefix) throws MorphException {
+
+ if(occurrences==null) loadTaggerDic();
+
+ return occurrences.getPrefixedBy(prefix);
+ }
+
+ /**
+  * Reads the tagger dictionary and builds the rule trie.
+  * Every line is split on ":"; lines that do not yield exactly four parts are
+  * skipped. The trie key is the first part followed by a key built from the
+  * second and third parts, and the value is the "/"-split of parts two to four.
+  *
+  * The trie is published to the static field only after the whole file has
+  * been read. Previously the field was assigned first, so a failed load left a
+  * partial trie that later getGR() calls used without retrying.
+  *
+  * @throws MorphException when the dictionary cannot be read or parsed
+  */
+ private static synchronized void loadTaggerDic() throws MorphException {
+
+   Trie<String, String[]> loaded = new Trie<String, String[]>(true);
+
+   try {
+
+     List<String> strs = FileUtil.readLines(KoreanEnv.getInstance().getValue(tagDicLoc), "UTF-8");
+
+     for (String str : strs) {
+       if (str == null) continue;
+       str = str.trim();
+       String[] syls = StringUtil.split(str,":");
+       if (syls.length != 4) continue;
+
+       // "F" rules are keyed on the third part first, all other rules on the second part first.
+       String key = null;
+       if ("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
+       else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
+
+       String[] patns = StringUtil.split(syls[1]+"/"+syls[2]+"/"+syls[3],"/");
+
+       loaded.add(syls[0]+key, patns);
+
+     }
+
+   } catch (Exception e) {
+     throw new MorphException("Fail to read the tagger dictionary.("+tagDicLoc+")\n"+e.getMessage());
+   }
+
+   occurrences = loaded;
+ }
+
+}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java Sun May 5 03:39:51 2013
@@ -0,0 +1,165 @@
+package org.apache.lucene.analysis.kr.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.kr.morph.PatternConstants;
+
+/**
+ * Class that handles the conditions under which a combination is possible.
+ */
+public class ConstraintUtil {
+
+ private static Map hahes = new HashMap(); // "ê¸ë¡ë²íí´ ", "민족íí´" ì²ë¼ íí´ì ê²°í©ì´ ê°ë¥í ëª
ì¬
+ static {
+ hahes.put("민족", "Y");hahes.put("ëì", "Y");hahes.put("ë¨ë¶", "Y");
+ }
+
+ private static Map eomiPnouns = new HashMap();
+ static {
+ eomiPnouns.put("ã´", "Y");eomiPnouns.put("ã¹", "Y");eomiPnouns.put("ã
", "Y");
+ }
+
+ private static Map PTN_MLIST= new HashMap();
+ static {
+ PTN_MLIST.put(PatternConstants.PTN_NSM, PatternConstants.PTN_NSM);
+ PTN_MLIST.put(PatternConstants.PTN_NSMXM, PatternConstants.PTN_NSMXM);
+ PTN_MLIST.put(PatternConstants.PTN_NJCM, PatternConstants.PTN_NJCM);
+ PTN_MLIST.put(PatternConstants.PTN_VM, PatternConstants.PTN_VM);
+ PTN_MLIST.put(PatternConstants.PTN_VMCM, PatternConstants.PTN_VMCM);
+ PTN_MLIST.put(PatternConstants.PTN_VMXM, PatternConstants.PTN_VMXM);
+ PTN_MLIST.put(PatternConstants.PTN_NVM, PatternConstants.PTN_NVM);
+ }
+
+ private static Map PTN_JLIST= new HashMap();
+ static {
+ PTN_JLIST.put(PatternConstants.PTN_NJ, PatternConstants.PTN_NJ);
+ PTN_JLIST.put(PatternConstants.PTN_NSMJ, PatternConstants.PTN_NSMJ);
+ PTN_JLIST.put(PatternConstants.PTN_VMJ, PatternConstants.PTN_VMJ);
+ PTN_JLIST.put(PatternConstants.PTN_VMXMJ, PatternConstants.PTN_VMXMJ);
+ }
+
+ private static Map WORD_GUKS= new HashMap();
+ static {
+ WORD_GUKS.put("ë ê²", "Y");
+ WORD_GUKS.put("ë¤ê²", "Y");
+ WORD_GUKS.put("ë³ê²", "Y");
+ WORD_GUKS.put("ì°°ê²", "Y");
+ WORD_GUKS.put("íê²", "Y");
+ WORD_GUKS.put("íìê²", "Y");
+ }
+
+ // Josa (particles) that cannot be attached to a syllable that has a final consonant (jongseong)
+ private static Map JOSA_TWO= new HashMap();
+ static {
+ JOSA_TWO.put("ê°", "Y");
+ JOSA_TWO.put("ë", "Y");
+ JOSA_TWO.put("ë¤", "Y");
+ JOSA_TWO.put("ë", "Y");
+ JOSA_TWO.put("ë", "Y");
+ JOSA_TWO.put("ê³ ", "Y");
+ JOSA_TWO.put("ë¼", "Y");
+ JOSA_TWO.put("ì", "Y");
+ JOSA_TWO.put("ë", "Y");
+ JOSA_TWO.put("를", "Y");
+ JOSA_TWO.put("ë©°", "Y");
+ JOSA_TWO.put("ë ", "Y");
+ JOSA_TWO.put("ì¼", "Y");
+ JOSA_TWO.put("ì¬", "Y");
+ }
+
+ // Josa (particles) that cannot be attached to a syllable that has no final consonant (jongseong)
+ private static Map JOSA_THREE= new HashMap();
+ static {
+ JOSA_THREE.put("ê³¼", "Y");
+ JOSA_THREE.put("ì", "Y");
+ JOSA_THREE.put("ì", "Y");
+ JOSA_THREE.put("ì¼", "Y");
+ JOSA_THREE.put("ì", "Y");
+ JOSA_THREE.put("ì", "Y");
+ }
+
+ /**
+  * Tells whether the given key is registered in the {@code hahes} table.
+  *
+  * @param key the word to look up
+  * @return true if {@code hahes} holds a non-null value for the key
+  */
+ public static boolean canHaheCompound(String key) {
+   return hahes.get(key) != null;
+ }
+
+ /**
+  * Checks whether the ending (eomi) ends with one of the consonants registered in
+  * {@code eomiPnouns} (nieun, rieul, mieum).
+  *
+  * The ending matches when it is itself one of those entries, or when its last character
+  * decomposes into three jaso and the third one (the final consonant) is one of them.
+  *
+  * @param eomi the ending to test; may be null or empty
+  * @return true if the ending matches, false otherwise (including null or empty input)
+  */
+ public static boolean isNLM(String eomi) {
+
+   if(eomi==null || "".equals(eomi)) return false;
+
+   if(eomiPnouns.get(eomi)!=null) return true;
+
+   char[] chrs = MorphUtil.decompose(eomi.charAt(eomi.length()-1));
+
+   // The original fell through to "return true" here, which made every non-empty
+   // ending match and turned the checks above into dead code.
+   return chrs.length==3 && eomiPnouns.get(Character.toString(chrs[2]))!=null;
+
+ }
+
+ /**
+  * Tells whether the pattern is one of the patterns registered in {@code PTN_MLIST}.
+  *
+  * @param ptn pattern constant to test
+  * @return true if {@code PTN_MLIST} contains the pattern
+  */
+ public static boolean isEomiPhrase(int ptn) {
+   return PTN_MLIST.containsKey(ptn);
+ }
+
+ /**
+  * Tells whether the pattern is one of the patterns registered in {@code PTN_JLIST}.
+  *
+  * @param ptn pattern constant to test
+  * @return true if {@code PTN_JLIST} contains the pattern
+  */
+ public static boolean isJosaNounPhrase(int ptn) {
+   return PTN_JLIST.containsKey(ptn);
+ }
+
+ /**
+  * Tells whether the pattern equals {@code PatternConstants.PTN_ADVJ}.
+  *
+  * @param ptn pattern constant to test
+  * @return true only for {@code PTN_ADVJ}
+  */
+ public static boolean isJosaAdvPhrase(int ptn) {
+   return ptn == PatternConstants.PTN_ADVJ;
+ }
+
+ /**
+  * Tells whether the pattern equals {@code PatternConstants.PTN_ADVJ} or
+  * {@code PatternConstants.PTN_AID}.
+  *
+  * @param ptn pattern constant to test
+  * @return true for {@code PTN_ADVJ} and {@code PTN_AID}, false for anything else
+  */
+ public static boolean isAdvPhrase(int ptn) {
+   if (ptn == PatternConstants.PTN_ADVJ) {
+     return true;
+   }
+   return ptn == PatternConstants.PTN_AID;
+ }
+
+ /**
+  * Tells whether the josa is registered in the {@code JOSA_TWO} table.
+  *
+  * @param josa the josa (particle) to look up
+  * @return true if {@code JOSA_TWO} contains the josa
+  */
+ public static boolean isTwoJosa(String josa) {
+   return JOSA_TWO.containsKey(josa);
+ }
+ /**
+  * Tells whether the josa is registered in the {@code JOSA_THREE} table.
+  *
+  * @param josa the josa (particle) to look up
+  * @return true if {@code JOSA_THREE} contains the josa
+  */
+ public static boolean isThreeJosa(String josa) {
+   return JOSA_THREE.containsKey(josa);
+ }
+}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java Sun May 5 03:39:51 2013
@@ -0,0 +1,308 @@
+package org.apache.lucene.analysis.kr.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.kr.morph.CompoundEntry;
+import org.apache.lucene.analysis.kr.morph.MorphException;
+import org.apache.lucene.analysis.kr.morph.WordEntry;
+
+public class DictionaryUtil {
+
+ private static Trie<String,WordEntry> dictionary;
+
+ private static HashMap josas;
+
+ private static HashMap eomis;
+
+ private static HashMap prefixs;
+
+ private static HashMap suffixs;
+
+ private static HashMap<String,WordEntry> uncompounds;
+
+ private static HashMap<String, String> cjwords;
+
+ /**
+  * Loads the dictionary.
+  *
+  * Reads the main dictionary file, appends the extension file, and reads the compounds file.
+  * Word lines have the form {@code word,features}; a six-character feature string is widened
+  * by inserting "000" before its last character. Compound lines have the form
+  * {@code word:part,part,...} and are stored with the fixed feature string "20000000X".
+  * Lines that do not split into exactly two fields are skipped.
+  *
+  * The trie is built locally and assigned to the static field only once it is complete, so
+  * a failed load never leaves a half-filled dictionary behind.
+  *
+  * @throws MorphException if any of the files cannot be read; the original exception is
+  *         attached as the cause
+  */
+ public synchronized static void loadDictionary() throws MorphException {
+
+   List<String> strList;
+   List<String> compounds;
+   try {
+     strList = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_DICTIONARY),"UTF-8");
+     strList.addAll(FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_EXTENSION),"UTF-8"));
+     compounds = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_COMPOUNDS),"UTF-8");
+   } catch (Exception e) {
+     // The original only constructed the exception and never threw it, so read errors
+     // were swallowed and surfaced later as a NullPointerException.
+     throw new MorphException(e.getMessage(),e);
+   }
+   if(strList==null||compounds==null) throw new MorphException("dictionary is null");
+
+   Trie<String, WordEntry> loaded = new Trie<String, WordEntry>(true);
+
+   for(String str:strList) {
+     String[] infos = StringUtil.split(str,",");
+     if(infos.length!=2) continue;
+     infos[1] = infos[1].trim();
+     if(infos[1].length()==6) infos[1] = infos[1].substring(0,5)+"000"+infos[1].substring(5);
+
+     WordEntry entry = new WordEntry(infos[0].trim(),infos[1].trim().toCharArray());
+     loaded.add(entry.getWord(), entry);
+   }
+
+   for(String compound: compounds) {
+     String[] infos = StringUtil.split(compound,":");
+     if(infos.length!=2) continue;
+     WordEntry entry = new WordEntry(infos[0].trim(),"20000000X".toCharArray());
+     entry.setCompounds(compoundArrayToList(infos[1], StringUtil.split(infos[1],",")));
+     loaded.add(entry.getWord(), entry);
+   }
+
+   // Publish only the fully populated trie.
+   dictionary = loaded;
+ }
+
+ /**
+  * Looks up dictionary entries by key prefix, loading the dictionary on first use.
+  *
+  * @param prefix key prefix handed to {@code dictionary.getPrefixedBy}
+  * @return the iterator produced by {@code dictionary.getPrefixedBy(prefix)}
+  * @throws MorphException if the dictionary has to be loaded and loading fails
+  */
+ public static Iterator findWithPrefix(String prefix) throws MorphException {
+   if (dictionary == null) {
+     loadDictionary();
+   }
+   return dictionary.getPrefixedBy(prefix);
+ }
+
+ public static WordEntry getWord(String key) throws MorphException {
+ if(dictionary==null) loadDictionary();
+ if(key.length()==0) return null;
+
+ return (WordEntry)dictionary.get(key);
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_NOUN} or {@code IDX_BUSA} feature is '1'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or neither feature is '1'
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getWordExceptVerb(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null
+       && (found.getFeature(WordEntry.IDX_NOUN) == '1' || found.getFeature(WordEntry.IDX_BUSA) == '1')) {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_NOUN} feature is '1'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or the feature is not '1'
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getNoun(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null && found.getFeature(WordEntry.IDX_NOUN) == '1') {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_NOUN} feature is '1' or '2'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or the feature is neither '1' nor '2'
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getCNoun(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null
+       && (found.getFeature(WordEntry.IDX_NOUN) == '1' || found.getFeature(WordEntry.IDX_NOUN) == '2')) {
+     return found;
+   }
+   return null;
+ }
+
+ public static WordEntry getVerb(String key) throws MorphException {
+
+ WordEntry entry = getWord(key);
+ if(entry==null) return null;
+
+ if(entry.getFeature(WordEntry.IDX_VERB)=='1') {
+ return entry;
+ }
+ return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_BUSA} feature is '1'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or the feature is not '1'
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getAdverb(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null && found.getFeature(WordEntry.IDX_BUSA) == '1') {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_BUSA} feature is '1' and its
+  * {@code IDX_NOUN} feature is '0'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or the features do not match
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getBusa(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null
+       && found.getFeature(WordEntry.IDX_BUSA) == '1'
+       && found.getFeature(WordEntry.IDX_NOUN) == '0') {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_VERB} feature is '1' and its
+  * {@code IDX_REGURA} feature equals the requested irregular type.
+  *
+  * @param key the word to look up
+  * @param irrType the value the {@code IDX_REGURA} feature must have
+  * @return the matching entry, or null if there is no entry or the features do not match
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getIrrVerb(String key, char irrType) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null
+       && found.getFeature(WordEntry.IDX_VERB) == '1'
+       && found.getFeature(WordEntry.IDX_REGURA) == irrType) {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_BEV} feature is '1'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or the feature is not '1'
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getBeVerb(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null && found.getFeature(WordEntry.IDX_BEV) == '1') {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry for the key only if its {@code IDX_DOV} feature is '1'.
+  *
+  * @param key the word to look up
+  * @return the matching entry, or null if there is no entry or the feature is not '1'
+  * @throws MorphException if the dictionary cannot be loaded
+  */
+ public static WordEntry getDoVerb(String key) throws MorphException {
+   WordEntry found = getWord(key);
+   if (found != null && found.getFeature(WordEntry.IDX_DOV) == '1') {
+     return found;
+   }
+   return null;
+ }
+
+ /**
+  * Returns the entry of the "uncompounds" dictionary stored under the given key.
+  *
+  * The dictionary file is read on first use. Its lines have the form
+  * {@code word:part,part,...}; lines that do not split into exactly two fields are skipped
+  * and every entry gets the fixed feature string "90000X".
+  *
+  * The map is filled locally and assigned to the static field only after the whole file
+  * has been processed, so a failed read does not leave an empty or partial map cached.
+  *
+  * @param key the word to look up
+  * @return the entry, or null if the key is not present
+  * @throws MorphException if the file cannot be read or parsed
+  */
+ public static WordEntry getUncompound(String key) throws MorphException {
+
+   if(uncompounds==null) {
+     HashMap<String,WordEntry> loaded = new HashMap<String,WordEntry>();
+     try {
+       List<String> lines = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_UNCOMPOUNDS),"UTF-8");
+       for(String compound: lines) {
+         String[] infos = StringUtil.split(compound,":");
+         if(infos.length!=2) continue;
+         WordEntry entry = new WordEntry(infos[0].trim(),"90000X".toCharArray());
+         entry.setCompounds(compoundArrayToList(infos[1], StringUtil.split(infos[1],",")));
+         loaded.put(entry.getWord(), entry);
+       }
+     }catch(Exception e) {
+       throw new MorphException(e);
+     }
+     // Publish only the fully populated map.
+     uncompounds = loaded;
+   }
+   return uncompounds.get(key);
+ }
+
+ public static String getCJWord(String key) throws MorphException {
+
+ try {
+ if(cjwords==null) {
+ cjwords = new HashMap();
+ List<String> lines = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_CJ),"UTF-8");
+ for(String cj: lines) {
+ String[] infos = StringUtil.split(cj,":");
+ if(infos.length!=2) continue;
+ cjwords.put(infos[0], infos[1]);
+ }
+ }
+ }catch(Exception e) {
+ throw new MorphException(e);
+ }
+ return cjwords.get(key);
+
+ }
+
+ public static boolean existJosa(String str) throws MorphException {
+ if(josas==null) {
+ josas = new HashMap();
+ readFile(josas,KoreanEnv.FILE_JOSA);
+ }
+ if(josas.get(str)==null) return false;
+ else return true;
+ }
+
+ public static boolean existEomi(String str) throws MorphException {
+ if(eomis==null) {
+ eomis = new HashMap();
+ readFile(eomis,KoreanEnv.FILE_EOMI);
+ }
+
+ if(eomis.get(str)==null) return false;
+ else return true;
+ }
+
+ public static boolean existPrefix(String str) throws MorphException {
+ if(prefixs==null) {
+ prefixs = new HashMap();
+ readFile(prefixs,KoreanEnv.FILE_PREFIX);
+ }
+
+ if(prefixs.get(str)==null) return false;
+ else return true;
+ }
+
+ public static boolean existSuffix(String str) throws MorphException {
+ if(suffixs==null) {
+ suffixs = new HashMap();
+ readFile(suffixs,KoreanEnv.FILE_SUFFIX);
+ }
+
+ if(suffixs.get(str)!=null) return true;
+
+ return false;
+ }
+
+ /**
+ * ã´,ã¹,ã
,ã
ê³¼ eomi ê° ê²°í©íì¬ ì´ë¯¸ê° ë ì ìëì§ ì ê²íë¤.
+ * @param s
+ * @param end
+ * @return
+ */
+ public static String combineAndEomiCheck(char s, String eomi) throws MorphException {
+
+ if(eomi==null) eomi="";
+
+ if(s=='ã´') eomi = "ì"+eomi;
+ else if(s=='ã¹') eomi = "ì"+eomi;
+ else if(s=='ã
') eomi = "ì"+eomi;
+ else if(s=='ã
') eomi = "ìµ"+eomi;
+ else eomi = s+eomi;
+
+ if(existEomi(eomi)) return eomi;
+
+ return null;
+
+ }
+
+ /**
+  * Reads a dictionary file and stores every line except the first one in the given map.
+  * The trimmed line is the key and the untrimmed line is the value.
+  *
+  * @param map the map to fill
+  * @param dic the {@code KoreanEnv} property name that resolves to the file path
+  * @throws MorphException if the file cannot be read
+  */
+ private static synchronized void readFile(HashMap map, String dic) throws MorphException {
+
+   String path = KoreanEnv.getInstance().getValue(dic);
+
+   try {
+     List<String> rows = FileUtil.readLines(path,"UTF-8");
+     // Index 0 is skipped on purpose; NOTE(review): presumably a header line, confirm against the data files.
+     int idx = 1;
+     while (idx < rows.size()) {
+       String row = rows.get(idx);
+       map.put(row.trim(), row);
+       idx++;
+     }
+   } catch (Exception e) {
+     // IOException and every other exception were handled identically, so one handler suffices.
+     throw new MorphException(e.getMessage(),e);
+   }
+ }
+
+ /**
+  * Wraps each element of the array in a {@code CompoundEntry} whose offset is the index of
+  * the element's first occurrence in the source string.
+  *
+  * @param source the string the offsets are computed against
+  * @param arr the compound parts
+  * @return a list with one {@code CompoundEntry} per array element, in array order
+  */
+ private static List compoundArrayToList(String source, String[] arr) {
+   List entries = new ArrayList();
+   for (int i = 0; i < arr.length; i++) {
+     String part = arr[i];
+     CompoundEntry item = new CompoundEntry(part);
+     int offset = source.indexOf(part);
+     item.setOffset(offset);
+     entries.add(item);
+   }
+   return entries;
+ }
+}
+
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java?rev=1479234&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java Sun May 5 03:39:51 2013
@@ -0,0 +1,665 @@
+package org.apache.lucene.analysis.kr.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
+import org.apache.lucene.analysis.kr.morph.MorphException;
+import org.apache.lucene.analysis.kr.morph.PatternConstants;
+
+public class EomiUtil {
+
+
+ public static final String RESULT_FAIL = "0";
+
+ public static final String RESULT_SUCCESS = "1";
+
+ public static final String[] verbSuffix = {
+ "ì´","í","ë","ì¤ë½","ì¤ë¬ì°","ìí¤","ì","ì","ê°","ë¹í","ë§í","ë리","ë°","ë","ë´"
+ };
+
+ /**
+ * ê°ì¥ 길ì´ê° 긴 ì´ë¯¸ë¥¼ ë¶ë¦¬íë¤.
+ * @param term
+ * @return
+ * @throws MorphException
+ */
+ public static String[] longestEomi(String term) throws MorphException {
+
+ String[] result = new String[2];
+ result[0] = term;
+
+ String stem;
+ String eomi;
+ char[] efeature;
+
+ for(int i=term.length();i>0;i--) {
+
+ stem = term.substring(0,i);
+
+ if(i!=term.length()) {
+ eomi = term.substring(i);
+ efeature = SyllableUtil.getFeature(eomi.charAt(0));
+ } else {
+ efeature = SyllableUtil.getFeature(stem.charAt(i-1));
+ eomi="";
+ }
+
+ if(SyllableUtil.isAlpanumeric(stem.charAt(i-1))) break;
+
+ char[] jasos = MorphUtil.decompose(stem.charAt(i-1));
+
+ if(!"".equals(eomi)&&!DictionaryUtil.existEomi(eomi)) {
+ // do not anything.
+ } else if(jasos.length>2&&
+ (jasos[2]=='ã´'||jasos[2]=='ã¹'||jasos[2]=='ã
'||jasos[2]=='ã
')&&
+ DictionaryUtil.combineAndEomiCheck(jasos[2], eomi)!=null) {
+ result[0] = Character.toString(MorphUtil.makeChar(stem.charAt(i-1), 0));
+ if(i!=0) result[0] = stem.substring(0,i-1)+result[0];
+ result[1] = Character.toString(jasos[2]);
+ }else if(i>0&&(stem.endsWith("í")&&"ì¬".equals(eomi))||
+ (stem.endsWith("ê°")&&"ê±°ë¼".equals(eomi))||
+ (stem.endsWith("ì¤")&&"ëë¼".equals(eomi))) {
+ result[0] = stem;
+ result[1] = eomi;
+ }else if(jasos.length==2&&(!stem.endsWith("ì")&&!stem.endsWith("ì´"))&&
+ (jasos[1]=='ã
'||jasos[1]=='ã
'||jasos[1]=='ã
'||jasos[1]=='ã
')&&
+ (DictionaryUtil.combineAndEomiCheck('ì´', eomi)!=null)) {
+ char[] chs = MorphUtil.decompose(stem.charAt(stem.length()-1));
+ result[0] = stem;
+ result[1] = "ì´"+eomi;
+ }else if((jasos[1]=='ã
'||jasos[1]=='ã
'||jasos[1]=='ã
'||jasos[1]=='ã
'||jasos[1]=='ã
')&&
+ (DictionaryUtil.combineAndEomiCheck('ì´', eomi)!=null)) {
+ String end = "";
+ if(jasos[1]=='ã
')
+ end=MorphUtil.makeChar(stem.charAt(i-1), 8, 0)+"ì";
+ else if(jasos[1]=='ã
')
+ end=MorphUtil.makeChar(stem.charAt(i-1), 13, 0)+"ì´";
+ else if(jasos[1]=='ã
')
+ end=Character.toString(MorphUtil.makeChar(stem.charAt(i-1), 6, 0));
+ else if(jasos[1]=='ã
')
+ end=MorphUtil.makeChar(stem.charAt(i-1), 0, 0)+"ì´";
+ else if(jasos[1]=='ã
')
+ end=MorphUtil.makeChar(stem.charAt(i-1), 20, 0)+"ì ";
+
+ if(jasos.length==3) {
+ end = end.substring(0,end.length()-1)+MorphUtil.replaceJongsung(end.charAt(end.length()-1),stem.charAt(i-1));
+ }
+
+ if(stem.length()<2) result[0] = end;
+ else result[0] = stem.substring(0,stem.length()-1)+end;
+ result[1] = eomi;
+
+ }else if(efeature!=null&&efeature[SyllableUtil.IDX_EOMI1]!='0'&&
+ DictionaryUtil.existEomi(eomi)) {
+ if(!(((jasos.length==2&&jasos[0]=='ã¹')||(jasos.length==3&&jasos[2]=='ã¹'))&&eomi.equals("ë¬"))) { // ã¹ ë¶ê·ì¹ì ìì¸
+ result[0] = stem;
+ result[1] = eomi;
+ }
+ }
+
+ if(efeature!=null&&efeature[SyllableUtil.IDX_EOMI2]=='0') break;
+ }
+
+ return result;
+
+ }
+
+ /**
+ * ì ì´ë§ì´ë¯¸ë¥¼ ë¶ìíë¤.
+ * @param stem
+ * @return
+ */
+ public static String[] splitPomi(String stem) throws MorphException {
+
+ // results[0]:ì±ê³µ(1)/ì¤í¨(0), results[1]: ì´ê·¼, results[2]: ì ì´ë§ì´ë¯¸
+ String[] results = new String[2];
+ results[0] = stem;
+
+ if(stem==null||stem.length()==0||"ì".equals(stem)) return results;
+
+ char[] chrs = stem.toCharArray();
+ int len = chrs.length;
+ String pomi = "";
+ int index = len-1;
+
+ char[] jaso = MorphUtil.decompose(chrs[index]);
+ if(chrs[index]!='ì'&&chrs[index]!='ã
'&&jaso[jaso.length-1]!='ã
') return results; // ì ì´ë§ì´ë¯¸ê° ë°ê²¬ëì§ ììë¤
+
+ if(chrs[index]=='ê² ') {
+ pomi = "ê² ";
+ setPomiResult(results,stem.substring(0,index),pomi);
+ if(--index<=0||
+ (chrs[index]!='ì'&&chrs[index]!='ã
'&&jaso[jaso.length-1]!='ã
'))
+ return results; // ë¤ìì´ê±°ë ì ì´ë§ì´ë¯¸ê° ìë¤ë©´...
+ jaso = MorphUtil.decompose(chrs[index]);
+ }
+
+ if(chrs[index]=='ì') { // ìì, ã
ì, ì
+ pomi = chrs[index]+pomi;
+ setPomiResult(results,stem.substring(0,index),pomi);
+ if(--index<=0||
+ (chrs[index]!='ì'&&chrs[index]!='ã
'&&jaso[jaso.length-1]!='ã
'))
+ return results; // ë¤ìì´ê±°ë ì ì´ë§ì´ë¯¸ê° ìë¤ë©´...
+ jaso = MorphUtil.decompose(chrs[index]);
+ }
+
+ if(chrs[index]=='ì'){
+ pomi = MorphUtil.replaceJongsung('ì´',chrs[index])+pomi;
+ if(index>0&&chrs[index-1]=='í')
+ stem = stem.substring(0,index);
+ else
+ stem = stem.substring(0,index)+"ì´";
+ setPomiResult(results,stem,pomi);
+ }else if(chrs[index]=='ì
¨'){
+ pomi = MorphUtil.replaceJongsung('ì´',chrs[index])+pomi;
+ stem = stem.substring(0,index);
+ setPomiResult(results,stem,"ì"+pomi);
+ }else if(chrs[index]=='ì'||chrs[index]=='ì') {
+ pomi = chrs[index]+pomi;
+ setPomiResult(results,stem.substring(0,index),pomi);
+ if(--index<=0||
+ (chrs[index]!='ì'&&chrs[index]!='ì¼')) return results; // ë¤ìì´ê±°ë ì ì´ë§ì´ë¯¸ê° ìë¤ë©´...
+ jaso = MorphUtil.decompose(chrs[index]);
+ }else if(jaso.length==3&&jaso[2]=='ã
') {
+
+ if(jaso[0]=='ã
'&&jaso[1]=='ã
') {
+ pomi = MorphUtil.replaceJongsung('ì´',chrs[index])+pomi;
+ stem = stem.substring(0,index)+"í";
+ }else if(jaso[0]!='ã
'&&(jaso[1]=='ã
'||jaso[1]=='ã
'||jaso[1]=='ã
'||jaso[1]=='ã
')) {
+ pomi = "ì"+pomi;
+ stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index], 0);
+ }else if(jaso[0]!='ã
'&&(jaso[1]=='ã
')) {
+ pomi = "ì"+pomi;
+ stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],11, 0);
+ } else if(jaso[1]=='ã
') {
+ pomi = MorphUtil.replaceJongsung('ì',chrs[index])+pomi;
+ stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],8, 0);
+ } else if(jaso[1]=='ã
') {
+ pomi = MorphUtil.replaceJongsung('ì´',chrs[index])+pomi;
+ stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],13, 0);
+ } else if(jaso[1]=='ã
') {
+ pomi = MorphUtil.replaceJongsung('ì´',chrs[index])+pomi;
+ stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],20, 0);
+ } else if(jaso[1]=='ã
') {
+ pomi = MorphUtil.replaceJongsung('ì´',chrs[index])+pomi;
+ stem = stem.substring(0,index);
+ } else if(jaso[1]=='ã
') {
+ pomi = MorphUtil.replaceJongsung('ì ',chrs[index])+pomi;
+ stem = stem.substring(0,index);
+ } else {
+ pomi = "ì"+pomi;
+ }
+ setPomiResult(results,stem,pomi);
+ if(chrs[index]!='ì'&&chrs[index]!='ì¼') return results; // ë¤ìì´ê±°ë ì ì´ë§ì´ë¯¸ê° ìë¤ë©´...
+ jaso = MorphUtil.decompose(chrs[index]);
+ }
+
+ char[] nChrs = null;
+ if(index>0) nChrs = MorphUtil.decompose(chrs[index-1]);
+ else nChrs = new char[2];
+
+ if(nChrs.length==2&&chrs[index]=='ì'&&(chrs.length<=index+1||
+ (chrs.length>index+1&&chrs[index+1]!='ì
¨'))) {
+ if(DictionaryUtil.getWord(results[0])!=null) return results; //'ì'ê° í¬í¨ë ë¨ì´ê° ìë¤. ì±ê°ìë¤/ëìë¤/ë¤ì¤ìë¤
+ pomi = chrs[index]+pomi;
+ setPomiResult(results,stem.substring(0,index),pomi);
+ if(--index==0||chrs[index]!='ì¼') return results; // ë¤ìì´ê±°ë ì ì´ë§ì´ë¯¸ê° ìë¤ë©´...
+ jaso = MorphUtil.decompose(chrs[index]);
+ }
+
+ if(index>0) nChrs = MorphUtil.decompose(chrs[index-1]);
+ else nChrs = new char[2];
+ if(chrs.length>index+1&&nChrs.length==3&&(chrs[index+1]=='ì
¨'||chrs[index+1]=='ì')&&chrs[index]=='ì¼') {
+ pomi = chrs[index]+pomi;
+ setPomiResult(results,stem.substring(0,index),pomi);
+ }
+
+ return results;
+ }
+
+ /**
+  * Finds the candidate base forms of an irregular predicate.
+  *
+  * The ending is the pomi of the output when one is set, otherwise its eomi. The helpers
+  * {@code irregularStem}, {@code irregularEnding} and {@code irregularAO} each add candidate
+  * arrays whose element 0 is a stem and element 1 an ending. For every candidate a clone
+  * of the output is returned with the candidate stem; when the output's pattern is
+  * {@code PTN_VM} the candidate ending also replaces the pomi (if the output has one) or
+  * the eomi.
+  *
+  * @param output the analysis result to expand
+  * @return a list of {@code AnalysisOutput} clones; empty when the stem or the ending is
+  *         null or empty
+  * @throws MorphException if the output cannot be cloned
+  */
+ public static List irregular(AnalysisOutput output) throws MorphException {
+
+   List results = new ArrayList();
+
+   String stem = output.getStem();
+   if(stem==null||stem.length()==0)
+     return results;
+
+   String ending = output.getEomi();
+   if(output.getPomi()!=null) ending = output.getPomi();
+
+   // The helpers read ending.charAt(0); without an ending there is nothing to analyse,
+   // so return the empty result instead of failing with an unchecked exception.
+   if(ending==null||ending.length()==0)
+     return results;
+
+   List<String[]> irrs = new ArrayList<String[]>();
+
+   irregularStem(irrs,stem,ending);
+   irregularEnding(irrs,stem,ending);
+   irregularAO(irrs,stem,ending);
+
+   try {
+     for(String[] irr: irrs) {
+       AnalysisOutput result = output.clone();
+       result.setStem(irr[0]);
+       if(output.getPatn()==PatternConstants.PTN_VM) {
+         if(output.getPomi()==null) result.setEomi(irr[1]);
+         else result.setPomi(irr[1]);
+       }
+       results.add(result);
+     }
+   } catch (CloneNotSupportedException e) {
+     throw new MorphException(e.getMessage(),e);
+   }
+
+   return results;
+
+ }
+
+ /**
+ * ì´ê°ë§ ë³íë ê²½ì°
+ * @param results
+ * @param stem
+ * @param ending
+ */
+ private static void irregularStem(List results, String stem, String ending) {
+
+ char feCh = ending.charAt(0);
+ char[] fechJaso = MorphUtil.decompose(feCh);
+ char ls = stem.charAt(stem.length()-1);
+ char[] lsJaso = MorphUtil.decompose(ls);
+
+ if(feCh=='ì'||feCh=='ì´'||feCh=='ì¼') {
+ if(lsJaso[lsJaso.length-1]=='ã¹') { // ã· ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),7)
+ ,ending
+ ,String.valueOf(PatternConstants.IRR_TYPE_DI)});
+ } else if(lsJaso.length==2) { // ã
ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),19)
+ ,ending
+ ,String.valueOf(PatternConstants.IRR_TYPE_SI)});
+ }
+ }
+
+ if((fechJaso[0]=='ã´'||fechJaso[0]=='ã¹'||fechJaso[0]=='ã
'|| feCh=='ì¤'||feCh=='ì')
+ &&(ls=='ì°')) { // ã
ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),17)
+ ,ending
+ ,String.valueOf(PatternConstants.IRR_TYPE_BI)});
+ }
+
+ if((fechJaso[0]=='ã´'||fechJaso[0]=='ã
'||fechJaso[0]=='ã
'|| feCh=='ì¤')
+ &&(lsJaso.length==2)) { // ã¹ íë½
+
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),8)
+ ,ending
+ ,String.valueOf(PatternConstants.IRR_TYPE_LI)});
+ }
+
+ if(lsJaso.length==2
+ &&(fechJaso[0]=='ã´'||fechJaso[0]=='ã¹'||fechJaso[0]=='ã
'||fechJaso[0]=='ã
'||
+ lsJaso[1]=='ã
'||lsJaso[1]=='ã
'||lsJaso[1]=='ã
'||lsJaso[1]=='ã
')
+ &&!"ë".equals(stem)) { // ã
ë¶ê·ì¹, ê·¸ë¬ë [ë³ë¤]ë ã
ë¶ê·ì¹ì´ ìëë¤.
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),27)
+ ,ending
+ ,String.valueOf(PatternConstants.IRR_TYPE_HI)});
+ }
+ }
+
+ /**
+ * ì´ë¯¸ë§ ë³íë ê²½ì°
+ * @param results
+ * @param stem
+ * @param ending
+ */
+ private static void irregularEnding(List results, String stem, String ending) {
+ if(ending.startsWith("ã
")) return;
+
+ char feCh = ending.charAt(0);
+ char ls = stem.charAt(stem.length()-1);
+
+ if(feCh=='ë¬'&&ls=='르') { // 'ë¬' ë¶ê·ì¹
+ results.add(
+ new String[]{stem
+ ,"ì´"+ending.substring(1)
+ ,String.valueOf(PatternConstants.IRR_TYPE_RO)});
+ } else if("ë¼".equals(ending)&&"ê°ê±°".equals(stem)) { // 'ê±°ë¼' ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)
+ ,"ì´ë¼"
+ ,String.valueOf(PatternConstants.IRR_TYPE_GU)});
+ } else if("ë¼".equals(ending)&&"ì¤ë".equals(stem)) { // 'ëë¼' ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)
+ ,"ì´ë¼"
+ ,String.valueOf(PatternConstants.IRR_TYPE_NU)});
+ }
+
+ if("ì¬".equals(ending)&&ls=='í') { // 'ì¬' ë¶ê·ì¹
+ results.add(
+ new String[]{stem
+ ,"ì´"
+ ,String.valueOf(PatternConstants.IRR_TYPE_NU)});
+ }
+ }
+
+ /**
+ * ì´ê°ê³¼ ì´ë¯¸ê° 모ë ë³íë ê²½ì°
+ * @param results
+ * @param stem
+ * @param ending
+ */
+ private static void irregularAO(List results, String stem, String ending) {
+
+ char ls = stem.charAt(stem.length()-1);
+ char[] lsJaso = MorphUtil.decompose(ls);
+
+ if(lsJaso.length<2) return;
+
+ if(lsJaso[1]=='ã
') {
+ if(stem.endsWith("ëì")||stem.endsWith("ê³ ì")) { // 'ê³±ë¤', 'ëë¤'ì 'ã
' ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-2)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-2),17) // + 'ã
'
+ ,makeTesnseEomi("ì",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_BI)});
+ }else { // 'ì' ì¶ì½
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),8,0) // ìì + ã
+ ,makeTesnseEomi("ì",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_WA)});
+ }
+ } else if(stem.endsWith("í¼")) {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),18,0) // ìì + -
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_WA)});
+ } else if(lsJaso[1]=='ã
') {
+ if(stem.length()>=2) // 'ã
' ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-2)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-2),17) // + 'ã
'
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_BI)});
+
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),13,0) // ìì + ã
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_WA)});
+ } else if(stem.length()>=2&&ls=='ë¼') {
+ char[] ns = MorphUtil.decompose(stem.charAt(stem.length()-2));
+ if(ns.length==3&&ns[2]=='ã¹') { // 르 ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-2)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-2),0) + "르"
+ ,makeTesnseEomi("ì",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_RO)});
+ }
+ } else if(stem.length()>=2&&ls=='ë¬') {
+ char[] ns = MorphUtil.decompose(stem.charAt(stem.length()-2));
+ if(stem.charAt(stem.length()-2)=='르') { // ë¬ ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_LO)});
+ } else if(ns.length==3&&ns[2]=='ã¹') { // 르 ë¶ê·ì¹
+ results.add(
+ new String[]{stem.substring(0,stem.length()-2)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-2),0) + "르"
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_RO)});
+ }
+ } else if(stem.endsWith("í´")||stem.endsWith("ì¼")) {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),20,0)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_EI)});
+ } else if(stem.endsWith("í´")) {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),0,0)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_EI)});
+ } else if(lsJaso.length==2&&lsJaso[1]=='ã
') {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),18,0)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_UO)});
+ } else if(lsJaso.length==2&&lsJaso[1]=='ã
') {
+ // ì¼ íë½
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),18,0)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_UO)});
+ // ì ë¶ê·ì¹
+ results.add(
+ new String[]{stem
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_AH)});
+ } else if(lsJaso[1]=='ã
') {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),20,0)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_EI)});
+ } else if(lsJaso[1]=='ã
') {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),11,0)
+ ,makeTesnseEomi("ì´",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_OE)});
+ } else if(lsJaso[1]=='ã
') {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),0,27)
+ ,makeTesnseEomi("ì",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_HI)});
+ } else if(lsJaso[1]=='ã
') {
+ results.add(
+ new String[]{stem.substring(0,stem.length()-1)+
+ MorphUtil.makeChar(stem.charAt(stem.length()-1),2,27)
+ ,makeTesnseEomi("ì",ending)
+ ,String.valueOf(PatternConstants.IRR_TYPE_HI)});
+ }
+
+ }
+
+ /**
+ * ìì ì ì´ë¯¸ë§ì ë§ë¤ì´ì ë°ííë¤.
+ * @param preword 'ì' ëë 'ì´'
+ * @param endword ì´ë¯¸[ì ì´ë¯¸ë§ì í¬í¨]
+ * @return 'ì' ëë 'ì'ì ë§ë¤ì´ì ë°ííë¤.
+ */
+ public static String makeTesnseEomi(String preword, String endword) {
+
+ if(preword==null||preword.length()==0) return endword;
+ if(endword==null||endword.length()==0) return preword;
+
+ if(endword.charAt(0)=='ã
') {
+ return preword.substring(0,preword.length()-1)+
+ MorphUtil.makeChar(preword.charAt(preword.length()-1),20)+endword.substring(1,endword.length());
+ } else if(endword.charAt(0)=='ã´') {
+ return preword.substring(0,preword.length()-1)+
+ MorphUtil.makeChar(preword.charAt(preword.length()-1),4)+endword.substring(1,endword.length());
+ } else if(endword.charAt(0)=='ã¹') {
+ return preword.substring(0,preword.length()-1)+
+ MorphUtil.makeChar(preword.charAt(preword.length()-1),8)+endword.substring(1,endword.length());
+ } else if(endword.charAt(0)=='ã
') {
+ return preword.substring(0,preword.length()-1)+
+ MorphUtil.makeChar(preword.charAt(preword.length()-1),16)+endword.substring(1,endword.length());
+ } else if(endword.charAt(0)=='ã
') {
+ return preword.substring(0,preword.length()-1)+
+ MorphUtil.makeChar(preword.charAt(preword.length()-1),17)+endword.substring(1,endword.length());
+ }
+ return preword+endword;
+ }
+
+
+
+ /**
+ * 'ì/기' + 'ì´' + ì´ë¯¸, 'ìì/ë¶í°/ììë¶í°' + 'ì´' + ì´ë¯¸ ì¸ì§ ì¡°ì¬íë¤.
+ * @param stem
+ * @return
+ */
+ public static boolean endsWithEEomi(String stem) {
+ int len = stem.length();
+ if(len<2||!stem.endsWith("ì´")) return false;
+
+ char[] jasos = MorphUtil.decompose(stem.charAt(len-2));
+ if(jasos.length==3&&jasos[2]=='ã
')
+ return true;
+ else {
+ int index = stem.lastIndexOf("기");
+ if(index==-1) index = stem.lastIndexOf("ìì");
+ if(index==-1) index = stem.lastIndexOf("ë¶í°");
+ if(index==-1) return false;
+ return true;
+ }
+ }
+
+ private static void setPomiResult(String[] results,String stem, String pomi ) {
+ results[0] = stem;
+ results[1] = pomi;
+ }
+
+ /**
+ *
+ * @param ch
+ * @return
+ */
+ public static boolean IsNLMBSyl(char ech, char lch) throws MorphException {
+
+ char[] features = SyllableUtil.getFeature(ech);
+
+ switch(lch) {
+
+ case 'ã´' :
+ return (features[SyllableUtil.IDX_YNPNA]=='1' || features[SyllableUtil.IDX_YNPLN]=='1');
+ case 'ã¹' :
+ return (features[SyllableUtil.IDX_YNPLA]=='1');
+ case 'ã
' :
+ return (features[SyllableUtil.IDX_YNPMA]=='1');
+ case 'ã
' :
+ return (features[SyllableUtil.IDX_YNPBA]=='1');
+ }
+
+ return false;
+ }
+
+ /**
+ * ì´ë¯¸ë¥¼ ë¶ë¦¬íë¤.
+ *
+ * 1. ê·ì¹ì©ì¸ê³¼ ì´ê°ë§ ë°ëë ë¶ê·ì¹ ì©ì¸
+ * 2. ì´ë¯¸ê° ì¢
ì± 'ã´/ã¹/ã
/ã
'ì¼ë¡ ììëë ì´ì
+ * 3. 'ì¬/ê±°ë¼/ëë¼'ì ë¶ê·ì¹ ì´ì
+ * 4. ì´ë¯¸ 'ì/ì´'ê° íë½ëë ì´ì
+ * 5. 'ì/ì´'ì ë³ì´ì²´ ë¶ë¦¬
+ *
+ * @param stem
+ * @param end
+ * @return
+ * @throws MorphException
+ */
+ public static String[] splitEomi(String stem, String end) throws MorphException {
+
+ String[] strs = new String[2];
+ int strlen = stem.length();
+ if(strlen==0) return strs;
+
+ char estem = stem.charAt(strlen-1);
+ char[] chrs = MorphUtil.decompose(estem);
+ if(chrs.length==1) return strs; // íê¸ì´ ìëë¼ë©´...
+
+ if((chrs.length==3)&&(chrs[2]=='ã´'||chrs[2]=='ã¹'||chrs[2]=='ã
'||chrs[2]=='ã
')&&
+ EomiUtil.IsNLMBSyl(estem,chrs[2])&&
+ DictionaryUtil.combineAndEomiCheck(chrs[2], end)!=null) {
+ strs[1] = Character.toString(chrs[2]);
+ if(end.length()>0) strs[1] += end;
+ strs[0] = stem.substring(0,strlen-1) + MorphUtil.makeChar(estem, 0);
+ } else if(estem=='í´'&&DictionaryUtil.existEomi("ì´"+end)) {
+ strs[0] = stem.substring(0,strlen-1)+"í";
+ strs[1] = "ì´"+end;
+ } else if(estem=='í'&&DictionaryUtil.existEomi("ì´"+end)) {
+ strs[0] = stem.substring(0,strlen-1)+"í";
+ strs[1] = "ì´"+end;
+ } else if(chrs[0]!='ã
'&&
+ (chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
')&&
+ (chrs.length==2 || SyllableUtil.getFeature(estem)[SyllableUtil.IDX_YNPAH]=='1')&&
+ (DictionaryUtil.combineAndEomiCheck('ì´', end)!=null)) {
+
+ strs[0] = stem;
+ if(chrs.length==2) strs[1] = "ì´"+end;
+ else strs[1] = end;
+ } else if(stem.endsWith("í")&&"ì¬".equals(end)) {
+ strs[0] = stem;
+ strs[1] = "ì´";
+ }else if((chrs.length==2)&&(chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
')&&
+ (DictionaryUtil.combineAndEomiCheck('ì´', end)!=null)) {
+
+ StringBuffer sb = new StringBuffer();
+
+ if(strlen>1) sb.append(stem.substring(0,strlen-1));
+
+ if(chrs[1]=='ã
')
+ sb.append(MorphUtil.makeChar(estem, 8, 0)).append(MorphUtil.replaceJongsung('ì',estem));
+ else if(chrs[1]=='ã
')
+ sb.append(MorphUtil.makeChar(estem, 13, 0)).append(MorphUtil.replaceJongsung('ì´',estem));
+ else if(chrs[1]=='ã
')
+ sb.append(MorphUtil.makeChar(estem, 11, 0)).append(MorphUtil.replaceJongsung('ì´',estem));
+ else if(chrs[1]=='ã
')
+ sb.append(Character.toString(MorphUtil.makeChar(estem, 20, 0))).append(MorphUtil.replaceJongsung('ì´',estem));
+ else if(chrs[1]=='ã
')
+ sb.append(MorphUtil.makeChar(estem, 0, 0)).append(MorphUtil.replaceJongsung('ì´',estem));
+ else if(chrs[1]=='ã
')
+ sb.append(MorphUtil.makeChar(estem, 20, 0)).append(MorphUtil.replaceJongsung('ì ',estem));
+
+
+ strs[0] = sb.toString();
+
+ end = strs[0].substring(strs[0].length()-1)+end;
+ strs[0] = strs[0].substring(0,strs[0].length()-1);
+
+ strs[1] = end;
+
+ }else if(!"".equals(end)&&DictionaryUtil.existEomi(end)) {
+ strs = new String[]{stem, end};
+ }
+
+ return strs;
+ }
+}