You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2013/05/06 00:26:36 UTC
svn commit: r1479410 [1/10] - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang: ./
src/java/org/apache/lucene/analysis/kr/
src/java/org/apache/lucene/analysis/kr/morph/
src/java/org/apache/lucene/analysis/kr/tagging/ src/java/org/apache/lucen...
Author: sarowe
Date: Sun May 5 22:26:35 2013
New Revision: 1479410
URL: http://svn.apache.org/r1479410
Log:
LUCENE-4956: - svn:eol-style -> native
- tabs -> spaces
- regularized java code indents to 2 spaces per level
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/ivy.xml (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/IndexWord.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilter.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilterFactory.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerFactory.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.java (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutput.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/CompoundNounAnalyzer.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/MorphAnalyzer.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/MorphAnalyzerManager.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/MorphException.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/NounProperty.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/PatternConstants.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/SpaceOutput.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/Status.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WSAOutput.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WSCandidateComparator.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WSOuputComparator.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WSOutput.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordEntry.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/WordSpaceAnalyzer.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/tagging/Tagger.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/DictionaryUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/EomiUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/FileUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/HanjaUtils.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/IrregularUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/JarResources.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/KoreanEnv.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/MorphUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/NounUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/StrBuilder.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/StringEscapeUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/SyllableUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/Trie.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/UnhandledException.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/UnmodifiableIterator.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/Utilities.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/utils/VerbUtil.java (contents, props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/cj.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/compounds.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/eomi.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/extension.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/josa.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/mapHanja.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/occurrence.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/prefix.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/suffix.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/syllable.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/total.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/dic/uncompounds.dic (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/kr/korean.properties (props changed)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/kr/TestKoreanAnalyzer.java (props changed)
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml Sun May 5 22:26:35 2013
@@ -22,13 +22,13 @@
<description>
Korean Morphological Analyzer
</description>
-
+
<import file="../analysis-module-build.xml"/>
<path id="classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
-
+
<target name="compile-core" depends="jar-analyzers-common, common.compile-core"/>
</project>
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/IndexWord.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/IndexWord.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/IndexWord.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/IndexWord.java Sun May 5 22:26:35 2013
@@ -22,34 +22,32 @@ package org.apache.lucene.analysis.kr;
*/
public class IndexWord {
- private String word;
-
- private int offset = 0;
-
- public IndexWord() {
-
- }
-
- public IndexWord(String word, int pos) {
- this.word = word;
- this.offset = pos;
- }
-
- public String getWord() {
- return word;
- }
+ private String word;
+
+ private int offset = 0;
+
+ public IndexWord() {
+
+ }
+
+ public IndexWord(String word, int pos) {
+ this.word = word;
+ this.offset = pos;
+ }
+
+ public String getWord() {
+ return word;
+ }
- public void setWord(String word) {
- this.word = word;
- }
-
- public int getOffset() {
- return offset;
- }
+ public void setWord(String word) {
+ this.word = word;
+ }
+
+ public int getOffset() {
+ return offset;
+ }
- public void setOffset(int offset) {
- this.offset = offset;
- }
-
-
+ public void setOffset(int offset) {
+ this.offset = offset;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java Sun May 5 22:26:35 2013
@@ -40,130 +40,128 @@ import org.apache.lucene.util.Version;
* @version $Id: KoreanAnalyzer.java,v 1.2 2013/04/07 13:09:33 smlee0818 Exp $
*/
public class KoreanAnalyzer extends StopwordAnalyzerBase {
-
- /** Default maximum allowed token length */
- public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
-
- private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
-
- /**
- * Specifies whether deprecated acronyms should be replaced with HOST type.
- * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
- */
- private final boolean replaceInvalidAcronym;
-
- private Set<String> stopSet;
-
- private boolean bigrammable = true;
-
- private boolean hasOrigin = true;
-
- private boolean exactMatch = false;
-
- private boolean originCNoun = true;
-
- public static final String DIC_ENCODING = "UTF-8";
-
- /** An unmodifiable set containing some common English words that are usually not
- useful for searching. */
- public static final CharArraySet STOP_WORDS_SET;
-
-
- static
- {
- List<String> stopWords = Arrays.asList(new String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
- "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
- "이","그","저","것","수","등","들","및","에서","그리고","그래서","또","또는"}
- );
-
- CharArraySet stopSet = new CharArraySet(Version.LUCENE_42, stopWords.size(), false);
-
- stopSet.addAll(stopWords);
- STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
- }
-
- public KoreanAnalyzer() {
- this(Version.LUCENE_42, STOP_WORDS_SET);
- }
-
- /**
- * 검색을 위한 형태소분석
- */
- public KoreanAnalyzer(boolean exactMatch) {
- this(Version.LUCENE_42, STOP_WORDS_SET);
- this.exactMatch = exactMatch;
- }
-
- public KoreanAnalyzer(Version matchVersion, String[] stopWords) throws IOException {
- this(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
- }
-
- public KoreanAnalyzer(Version matchVersion) throws IOException {
- this(matchVersion, STOP_WORDS_SET);
- }
-
- public KoreanAnalyzer(Version matchVersion, File stopwords) throws IOException {
- this(matchVersion, loadStopwordSet(stopwords, matchVersion));
- }
-
- public KoreanAnalyzer(Version matchVersion, File stopwords, String encoding) throws IOException {
- this(matchVersion, loadStopwordSet(stopwords, matchVersion));
- }
-
- public KoreanAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, loadStopwordSet(stopwords, matchVersion));
- }
-
- public KoreanAnalyzer(Version matchVersion, CharArraySet stopWords) {
- super(matchVersion, stopWords);
- replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_42);
- }
-
-
- @Override
- protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
- final KoreanTokenizer src = new KoreanTokenizer(matchVersion, reader);
- src.setMaxTokenLength(maxTokenLength);
- TokenStream tok = new KoreanFilter(src, bigrammable, hasOrigin, exactMatch, originCNoun);
- tok = new LowerCaseFilter(matchVersion, tok);
- tok = new StopFilter(matchVersion, tok, stopwords);
- return new TokenStreamComponents(src, tok) {
- @Override
- protected void setReader(final Reader reader) throws IOException {
- src.setMaxTokenLength(KoreanAnalyzer.this.maxTokenLength);
- super.setReader(reader);
- }
- };
- }
-
- /**
- * determine whether the bigram index term is returned or not if a input word is failed to analysis
- * If true is set, the bigram index term is returned. If false is set, the bigram index term is not returned.
- */
- public void setBigrammable(boolean is) {
- bigrammable = is;
- }
-
- /**
- * determin whether the original term is returned or not if a input word is analyzed morphically.
- */
- public void setHasOrigin(boolean has) {
- hasOrigin = has;
- }
-
- /**
- * determin whether the original compound noun is returned or not if a input word is analyzed morphically.
- */
- public void setOriginCNoun(boolean cnoun) {
- originCNoun = cnoun;
- }
-
- /**
- * determin whether the original compound noun is returned or not if a input word is analyzed morphically.
- */
- public void setExactMatch(boolean exact) {
- exactMatch = exact;
- }
-
+
+ /** Default maximum allowed token length */
+ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /**
+ * Specifies whether deprecated acronyms should be replaced with HOST type.
+ * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
+ */
+ private final boolean replaceInvalidAcronym;
+
+ private Set<String> stopSet;
+
+ private boolean bigrammable = true;
+
+ private boolean hasOrigin = true;
+
+ private boolean exactMatch = false;
+
+ private boolean originCNoun = true;
+
+ public static final String DIC_ENCODING = "UTF-8";
+
+ /** An unmodifiable set containing some common English words that are usually not
+ useful for searching. */
+ public static final CharArraySet STOP_WORDS_SET;
+
+
+ static {
+ List<String> stopWords = Arrays.asList(new String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
+ "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
+ "이","그","저","것","수","등","들","및","에서","그리고","그래서","또","또는"}
+ );
+
+ CharArraySet stopSet = new CharArraySet(Version.LUCENE_42, stopWords.size(), false);
+
+ stopSet.addAll(stopWords);
+ STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+ }
+
+ public KoreanAnalyzer() {
+ this(Version.LUCENE_42, STOP_WORDS_SET);
+ }
+
+ /**
+ * 검색을 위한 형태소분석
+ */
+ public KoreanAnalyzer(boolean exactMatch) {
+ this(Version.LUCENE_42, STOP_WORDS_SET);
+ this.exactMatch = exactMatch;
+ }
+
+ public KoreanAnalyzer(Version matchVersion, String[] stopWords) throws IOException {
+ this(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
+ }
+
+ public KoreanAnalyzer(Version matchVersion) throws IOException {
+ this(matchVersion, STOP_WORDS_SET);
+ }
+
+ public KoreanAnalyzer(Version matchVersion, File stopwords) throws IOException {
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
+ }
+
+ public KoreanAnalyzer(Version matchVersion, File stopwords, String encoding) throws IOException {
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
+ }
+
+ public KoreanAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
+ }
+
+ public KoreanAnalyzer(Version matchVersion, CharArraySet stopWords) {
+ super(matchVersion, stopWords);
+ replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_42);
+ }
+
+
+ @Override
+ protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+ final KoreanTokenizer src = new KoreanTokenizer(matchVersion, reader);
+ src.setMaxTokenLength(maxTokenLength);
+ TokenStream tok = new KoreanFilter(src, bigrammable, hasOrigin, exactMatch, originCNoun);
+ tok = new LowerCaseFilter(matchVersion, tok);
+ tok = new StopFilter(matchVersion, tok, stopwords);
+ return new TokenStreamComponents(src, tok) {
+ @Override
+ protected void setReader(final Reader reader) throws IOException {
+ src.setMaxTokenLength(KoreanAnalyzer.this.maxTokenLength);
+ super.setReader(reader);
+ }
+ };
+ }
+
+ /**
+ * determine whether the bigram index term is returned or not if a input word is failed to analysis
+ * If true is set, the bigram index term is returned. If false is set, the bigram index term is not returned.
+ */
+ public void setBigrammable(boolean is) {
+ bigrammable = is;
+ }
+
+ /**
+ * determin whether the original term is returned or not if a input word is analyzed morphically.
+ */
+ public void setHasOrigin(boolean has) {
+ hasOrigin = has;
+ }
+
+ /**
+ * determin whether the original compound noun is returned or not if a input word is analyzed morphically.
+ */
+ public void setOriginCNoun(boolean cnoun) {
+ originCNoun = cnoun;
+ }
+
+ /**
+ * determin whether the original compound noun is returned or not if a input word is analyzed morphically.
+ */
+ public void setExactMatch(boolean exact) {
+ exactMatch = exact;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilter.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilter.java Sun May 5 22:26:35 2013
@@ -47,396 +47,378 @@ import org.apache.lucene.analysis.tokena
public class KoreanFilter extends TokenFilter {
- private LinkedList<IndexWord> morphQueue;
-
- private MorphAnalyzer morph;
-
- private WordSpaceAnalyzer wsAnal;
-
- private boolean bigrammable = true;
-
- private boolean hasOrigin = true;
-
- private boolean originCNoun = true;
-
- private boolean exactMatch = false;
-
- private char[] curTermBuffer;
-
- private int curTermLength;
-
- private String curType;
-
- private String curSource;
-
- private int tokStart;
-
- private int hanStart = 0; // 한글의 시작 위치, 복합명사일경우
-
- private int chStart = 0;
-
- private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
- private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
-
- public KoreanFilter(TokenStream input) {
- super(input);
- morphQueue = new LinkedList<IndexWord>();
- morph = new MorphAnalyzer();
- wsAnal = new WordSpaceAnalyzer();
- cnAnalyzer.setExactMach(false);
- }
-
- /**
- *
- * @param input input token stream
- * @param bigram Whether the bigram index term return or not.
- */
- public KoreanFilter(TokenStream input, boolean bigram) {
- this(input);
- bigrammable = bigram;
- }
-
- public KoreanFilter(TokenStream input, boolean bigram, boolean has) {
- this(input, bigram);
- hasOrigin = has;
- }
-
- public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match) {
- this(input, bigram,has);
- this.exactMatch = match;
- }
-
- public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun) {
- this(input, bigram,has, match);
- this.originCNoun = cnoun;
- }
-
- public final boolean incrementToken() throws IOException {
-
- if(curTermBuffer!=null&&morphQueue.size()>0) {
- setTermBufferByQueue(false);
- return true;
- }
-
- if(!input.incrementToken()) return false;
-
- curTermBuffer = termAtt.buffer().clone();
- curTermLength = termAtt.length();
- tokStart = offsetAtt.startOffset();
- curType = typeAtt.type();
-
- try {
- if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType)) {
- analysisKorean(new String(curTermBuffer,0,termAtt.length()));
- } else if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(curType)) {
- analysisChinese(new String(curTermBuffer,0,termAtt.length()));
- } else {
- analysisETC(new String(curTermBuffer,0,termAtt.length()));
- }
- }catch(MorphException e) {
- throw new IOException("Korean Filter MorphException\n"+e.getMessage());
- }
-
- if(morphQueue!=null&&morphQueue.size()>0) {
- setTermBufferByQueue(true);
- } else {
- return incrementToken();
- }
-
- return true;
-
- }
-
- /**
- * queue에 저장된 값으로 buffer의 값을 복사한다.
- */
- private void setTermBufferByQueue(boolean isFirst) {
-
- clearAttributes();
+ private LinkedList<IndexWord> morphQueue;
+
+ private MorphAnalyzer morph;
+
+ private WordSpaceAnalyzer wsAnal;
+
+ private boolean bigrammable = true;
+
+ private boolean hasOrigin = true;
+
+ private boolean originCNoun = true;
+
+ private boolean exactMatch = false;
+
+ private char[] curTermBuffer;
+
+ private int curTermLength;
+
+ private String curType;
+
+ private String curSource;
+
+ private int tokStart;
+
+ private int hanStart = 0; // 한글의 시작 위치, 복합명사일경우
+
+ private int chStart = 0;
+
+ private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+ private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+
+ public KoreanFilter(TokenStream input) {
+ super(input);
+ morphQueue = new LinkedList<IndexWord>();
+ morph = new MorphAnalyzer();
+ wsAnal = new WordSpaceAnalyzer();
+ cnAnalyzer.setExactMach(false);
+ }
+
+ /**
+ *
+ * @param input input token stream
+ * @param bigram Whether the bigram index term return or not.
+ */
+ public KoreanFilter(TokenStream input, boolean bigram) {
+ this(input);
+ bigrammable = bigram;
+ }
+
+ public KoreanFilter(TokenStream input, boolean bigram, boolean has) {
+ this(input, bigram);
+ hasOrigin = has;
+ }
+
+ public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match) {
+ this(input, bigram,has);
+ this.exactMatch = match;
+ }
+
+ public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun) {
+ this(input, bigram,has, match);
+ this.originCNoun = cnoun;
+ }
+
+ public final boolean incrementToken() throws IOException {
+
+ if(curTermBuffer!=null&&morphQueue.size()>0) {
+ setTermBufferByQueue(false);
+ return true;
+ }
+
+ if(!input.incrementToken()) return false;
+
+ curTermBuffer = termAtt.buffer().clone();
+ curTermLength = termAtt.length();
+ tokStart = offsetAtt.startOffset();
+ curType = typeAtt.type();
+
+ try {
+ if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType)) {
+ analysisKorean(new String(curTermBuffer,0,termAtt.length()));
+ } else if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(curType)) {
+ analysisChinese(new String(curTermBuffer,0,termAtt.length()));
+ } else {
+ analysisETC(new String(curTermBuffer,0,termAtt.length()));
+ }
+ }catch(MorphException e) {
+ throw new IOException("Korean Filter MorphException\n"+e.getMessage());
+ }
+
+ if(morphQueue!=null&&morphQueue.size()>0) {
+ setTermBufferByQueue(true);
+ } else {
+ return incrementToken();
+ }
+
+ return true;
+
+ }
+
+ /**
+ * queue에 저장된 값으로 buffer의 값을 복사한다.
+ */
+ private void setTermBufferByQueue(boolean isFirst) {
+
+ clearAttributes();
- IndexWord iw = morphQueue.removeFirst();
- int pos = iw.getOffset();
+ IndexWord iw = morphQueue.removeFirst();
+ int pos = iw.getOffset();
- termAtt.copyBuffer(iw.getWord().toCharArray(), 0, iw.getWord().length());
- offsetAtt.setOffset(tokStart+pos, tokStart + pos + iw.getWord().length());
+ termAtt.copyBuffer(iw.getWord().toCharArray(), 0, iw.getWord().length());
+ offsetAtt.setOffset(tokStart+pos, tokStart + pos + iw.getWord().length());
+
+ if(!isFirst && iw.getOffset()==0) {
+ posIncrAtt.setPositionIncrement(0);
+// posLenAtt.setPositionLength(iw.getWord().length());
+ }
- if(!isFirst && iw.getOffset()==0) {
- posIncrAtt.setPositionIncrement(0);
-// posLenAtt.setPositionLength(iw.getWord().length());
+ }
+
+ /**
+ * 한글을 분석한다.
+ * @throws MorphException
+ */
+ private void analysisKorean(String input) throws MorphException {
+
+ List<AnalysisOutput> outputs = morph.analyze(input);
+ if(outputs.size()==0) return;
+
+ Map<String,IndexWord> map = new LinkedHashMap<String,IndexWord>();
+ if(hasOrigin) map.put(input, new IndexWord(input,0));
+
+ if(outputs.get(0).getScore()>=AnalysisOutput.SCORE_COMPOUNDS) {
+ extractKeyword(outputs, map);
+ } else {
+ try {
+ List<AnalysisOutput> list = wsAnal.analyze(input);
+
+ List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
+ if(list.size()>1) {
+ for(AnalysisOutput o : list) {
+ if(hasOrigin) map.put(o.getSource(), new IndexWord(o.getSource(),0));
+ results.addAll(morph.analyze(o.getSource()));
+ }
+ } else {
+ results.addAll(list);
}
+
+ extractKeyword(results, map);
+ } catch(Exception e) {
+ extractKeyword(outputs, map);
+ }
+ }
+
+ Iterator<String> iter = map.keySet().iterator();
+
+ while(iter.hasNext()) {
+ String text = iter.next();
+ if(text.length()<=1) continue;
+ morphQueue.add(map.get(text));
+ }
+
+ }
+
+ private void extractKeyword(List<AnalysisOutput> outputs, Map<String,IndexWord> map) throws MorphException {
+
+ for(AnalysisOutput output : outputs) {
+
+ if(output.getPos()!=PatternConstants.POS_VERB) {
+ if(originCNoun || (!originCNoun&&output.getCNounList().size()==0)) {
+ map.put(output.getStem(), new IndexWord(output.getStem(),0));
+ }
+// }else {
+// map.put(output.getStem()+"다", new Integer(1));
+ }
+
+ if(exactMatch) continue;
+
+ if(output.getScore()>=AnalysisOutput.SCORE_COMPOUNDS) {
- }
-
- /**
- * 한글을 분석한다.
- * @throws MorphException
- */
- private void analysisKorean(String input) throws MorphException {
-
- List<AnalysisOutput> outputs = morph.analyze(input);
- if(outputs.size()==0) return;
-
- Map<String,IndexWord> map = new LinkedHashMap<String,IndexWord>();
- if(hasOrigin) map.put(input, new IndexWord(input,0));
-
- if(outputs.get(0).getScore()>=AnalysisOutput.SCORE_COMPOUNDS)
- {
- extractKeyword(outputs, map);
- }
- else
- {
- try
- {
- List<AnalysisOutput> list = wsAnal.analyze(input);
-
- List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
- if(list.size()>1) {
- for(AnalysisOutput o : list) {
- if(hasOrigin) map.put(o.getSource(), new IndexWord(o.getSource(),0));
- results.addAll(morph.analyze(o.getSource()));
- }
- } else {
- results.addAll(list);
- }
-
- extractKeyword(results, map);
-
- }
- catch(Exception e)
- {
- extractKeyword(outputs, map);
- }
-
- }
-
- Iterator<String> iter = map.keySet().iterator();
-
- while(iter.hasNext()) {
- String text = iter.next();
- if(text.length()<=1) continue;
- morphQueue.add(map.get(text));
- }
-
- }
-
- private void extractKeyword(List<AnalysisOutput> outputs, Map<String,IndexWord> map) throws MorphException {
-
- for(AnalysisOutput output : outputs) {
-
- if(output.getPos()!=PatternConstants.POS_VERB) {
- if(originCNoun || (!originCNoun&&output.getCNounList().size()==0)) {
- map.put(output.getStem(), new IndexWord(output.getStem(),0));
- }
-// }else {
-// map.put(output.getStem()+"다", new Integer(1));
- }
-
- if(exactMatch) continue;
-
- if(output.getScore()>=AnalysisOutput.SCORE_COMPOUNDS) {
-
- List<CompoundEntry> cnouns = output.getCNounList();
- int start = 0;
-
- for(int jj=0;jj<cnouns.size();jj++) {
-
- CompoundEntry cnoun = cnouns.get(jj);
-
- if(cnoun.getWord().length()>1) map.put(cnoun.getWord(), new IndexWord(cnoun.getWord(), start));
-
- if(jj==0 && cnoun.getWord().length()==1) {
- map.put(cnoun.getWord()+cnouns.get(jj+1).getWord(), new IndexWord(cnoun.getWord(), start));
- } else if(jj>1 && cnoun.getWord().length()==1) {
- String iw = cnouns.get(jj-1).getWord()+cnoun.getWord();
- map.put(iw, new IndexWord(iw, start-cnouns.get(jj-1).getWord().length()));
- }
-
- start += cnoun.getWord().length();
- }
-
- } else if(bigrammable) {
- addBiagramToMap(output.getStem(),map);
- }
-
- }
-
- }
-
- private void addBiagramToMap(String input, Map<String,IndexWord> map) {
-
- int offset = 0;
- int strlen = input.length();
-
- while(offset<strlen-1) {
-
- if(isAlphaNumChar(input.charAt(offset))) {
- String text = findAlphaNumeric(input.substring(offset));
- map.put(text, new IndexWord(text, offset));
- offset += text.length();
- } else {
- String text = input.substring(offset,
- offset+2>strlen?strlen:offset+2);
- map.put(text, new IndexWord(text, offset));
- offset++;
- }
-
- }
- }
-
- private String findAlphaNumeric(String text) {
- int pos = 0;
- for(int i=0;i<text.length();i++) {
- if(!isAlphaNumChar(text.charAt(i))) break;
- pos++;
- }
- return text.substring(0,pos);
- }
-
- /**
- * 한자는 2개이상의 한글 음으로 읽혀질 수 있다.
- * 두음법칙이 아님.
- * @param term
- * @throws MorphException
- */
- private void analysisChinese(String term) throws MorphException {
-
- morphQueue.add(new IndexWord(term,0));
- if(term.length()<2) return; // 1글자 한자는 색인어로 한글을 추출하지 않는다.
-
- List<StringBuffer> candiList = new ArrayList<StringBuffer>();
- candiList.add(new StringBuffer());
-
-
-
- for(int i=0;i<term.length();i++) {
-
- char[] chs = HanjaUtils.convertToHangul(term.charAt(i));
- if(chs==null) continue;
-
- List<StringBuffer> removeList = new ArrayList<StringBuffer>(); // 제거될 후보를 저장
-
- int caniSize = candiList.size();
-
- for(int j=0;j<caniSize;j++) {
- String origin = candiList.get(j).toString();
-
- for(int k=0;k<chs.length;k++) { // 추가로 생성된 음에 대해서 새로운 텍스트를 생성한다.
-
- if(k==4) break; // 4개 이상의 음을 가지고 있는 경우 첫번째 음으로만 처리를 한다.
-
- StringBuffer sb = candiList.get(j);
- if(k>0) sb = new StringBuffer(origin);
-
- sb.append(chs[k]);
- if(k>0) candiList.add(sb);
-
- Iterator iter = DictionaryUtil.findWithPrefix(sb.toString());
- if(!iter.hasNext()) // 사전에 없으면 삭제 후보
- removeList.add(sb);
-
- }
-
- }
-
- if(removeList.size()==candiList.size()) { // 사전에서 찾은 단어가 하나도 없다면..
- candiList = candiList.subList(0, 1); // 첫번째만 생성하고 나머지는 버림
- }
-
- for(StringBuffer rsb : removeList) {
- if(candiList.size()>1) candiList.remove(rsb);
- }
-
- }
-
- int maxCandidate = 5;
- if(candiList.size()<maxCandidate) maxCandidate=candiList.size();
-
- for(int i=0;i<maxCandidate;i++) {
- morphQueue.add(new IndexWord(candiList.get(i).toString(),0));
- }
-
- Map<String, String> cnounMap = new HashMap<String, String>();
-
- // 추출된 명사가 복합명사인 경우 분리한다.
- for(int i=0;i<maxCandidate;i++) {
- List<CompoundEntry> results = confirmCNoun(candiList.get(i).toString());
-
- int pos = 0;
- int offset = 0;
- for(CompoundEntry entry : results) {
- pos += entry.getWord().length();
- if(cnounMap.get(entry.getWord())!=null) continue;
-
- // 한글과 매치되는 한자를 짤라서 큐에 저장한다.
- morphQueue.add(new IndexWord(term.substring(offset,pos),offset));
-
- cnounMap.put(entry.getWord(), entry.getWord());
-
- if(entry.getWord().length()<2) continue; // 한글은 2글자 이상만 저장한다.
-
- // 분리된 한글을 큐에 저장한다.
- morphQueue.add(new IndexWord(entry.getWord(),offset));
-
- offset = pos;
- }
- }
- }
-
- private List<CompoundEntry> confirmCNoun(String input) throws MorphException {
-
- WordEntry cnoun = DictionaryUtil.getCNoun(input);
- if(cnoun!=null && cnoun.getFeature(WordEntry.IDX_NOUN)=='2') {
- return cnoun.getCompounds();
- }
-
- return cnAnalyzer.analyze(input);
-
- }
-
- private void analysisETC(String term) throws MorphException {
-
- final char[] buffer = termAtt.buffer();
- final int bufferLength = termAtt.length();
- final String type = typeAtt.type();
-
- if (type == APOSTROPHE_TYPE && // remove 's
- bufferLength >= 2 &&
- buffer[bufferLength-2] == '\'' &&
- (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
- // Strip last 2 characters off
- morphQueue.add(new IndexWord(term.substring(0,bufferLength - 2),0));
- } else if (type == ACRONYM_TYPE) { // remove dots
- int upto = 0;
- for(int i=0;i<bufferLength;i++) {
- char c = buffer[i];
- if (c != '.')
- buffer[upto++] = c;
- }
- morphQueue.add(new IndexWord(term.substring(0,upto),0));
- } else {
- morphQueue.add(new IndexWord(term,0));
- }
-
- }
-
- private boolean isAlphaNumChar(int c) {
- if((c>=48&&c<=57)||(c>=65&&c<=122)) return true;
- return false;
- }
-
- public void setHasOrigin(boolean has) {
- hasOrigin = has;
- }
-
- public void setExactMatch(boolean match) {
- this.exactMatch = match;
- }
+ List<CompoundEntry> cnouns = output.getCNounList();
+ int start = 0;
+
+ for(int jj=0;jj<cnouns.size();jj++) {
+
+ CompoundEntry cnoun = cnouns.get(jj);
+
+ if(cnoun.getWord().length()>1) map.put(cnoun.getWord(), new IndexWord(cnoun.getWord(), start));
+
+ if(jj==0 && cnoun.getWord().length()==1) {
+ map.put(cnoun.getWord()+cnouns.get(jj+1).getWord(), new IndexWord(cnoun.getWord(), start));
+ } else if(jj>1 && cnoun.getWord().length()==1) {
+ String iw = cnouns.get(jj-1).getWord()+cnoun.getWord();
+ map.put(iw, new IndexWord(iw, start-cnouns.get(jj-1).getWord().length()));
+ }
+
+ start += cnoun.getWord().length();
+ }
+
+ } else if(bigrammable) {
+ addBiagramToMap(output.getStem(),map);
+ }
+ }
+ }
+
+ private void addBiagramToMap(String input, Map<String,IndexWord> map) {
+
+ int offset = 0;
+ int strlen = input.length();
+
+ while(offset<strlen-1) {
+
+ if(isAlphaNumChar(input.charAt(offset))) {
+ String text = findAlphaNumeric(input.substring(offset));
+ map.put(text, new IndexWord(text, offset));
+ offset += text.length();
+ } else {
+ String text = input.substring(offset,
+ offset+2>strlen?strlen:offset+2);
+ map.put(text, new IndexWord(text, offset));
+ offset++;
+ }
+ }
+ }
+
+ private String findAlphaNumeric(String text) {
+ int pos = 0;
+ for(int i=0;i<text.length();i++) {
+ if(!isAlphaNumChar(text.charAt(i))) break;
+ pos++;
+ }
+ return text.substring(0,pos);
+ }
+
+ /**
+ * 한자는 2개이상의 한글 음으로 읽혀질 수 있다.
+ * 두음법칙이 아님.
+ * @param term
+ * @throws MorphException
+ */
+ private void analysisChinese(String term) throws MorphException {
+
+ morphQueue.add(new IndexWord(term,0));
+ if(term.length()<2) return; // 1글자 한자는 색인어로 한글을 추출하지 않는다.
+
+ List<StringBuffer> candiList = new ArrayList<StringBuffer>();
+ candiList.add(new StringBuffer());
+
+ for(int i=0;i<term.length();i++) {
+
+ char[] chs = HanjaUtils.convertToHangul(term.charAt(i));
+ if(chs==null) continue;
+
+ List<StringBuffer> removeList = new ArrayList<StringBuffer>(); // 제거될 후보를 저장
+
+ int caniSize = candiList.size();
+
+ for(int j=0;j<caniSize;j++) {
+ String origin = candiList.get(j).toString();
+
+ for(int k=0;k<chs.length;k++) { // 추가로 생성된 음에 대해서 새로운 텍스트를 생성한다.
+
+ if(k==4) break; // 4개 이상의 음을 가지고 있는 경우 첫번째 음으로만 처리를 한다.
+
+ StringBuffer sb = candiList.get(j);
+ if(k>0) sb = new StringBuffer(origin);
+
+ sb.append(chs[k]);
+ if(k>0) candiList.add(sb);
+
+ Iterator iter = DictionaryUtil.findWithPrefix(sb.toString());
+ if(!iter.hasNext()) // 사전에 없으면 삭제 후보
+ removeList.add(sb);
+ }
+ }
+
+ if(removeList.size()==candiList.size()) { // 사전에서 찾은 단어가 하나도 없다면..
+ candiList = candiList.subList(0, 1); // 첫번째만 생성하고 나머지는 버림
+ }
+
+ for(StringBuffer rsb : removeList) {
+ if(candiList.size()>1) candiList.remove(rsb);
+ }
+ }
+
+ int maxCandidate = 5;
+ if(candiList.size()<maxCandidate) maxCandidate=candiList.size();
+
+ for(int i=0;i<maxCandidate;i++) {
+ morphQueue.add(new IndexWord(candiList.get(i).toString(),0));
+ }
+
+ Map<String, String> cnounMap = new HashMap<String, String>();
+
+ // 추출된 명사가 복합명사인 경우 분리한다.
+ for(int i=0;i<maxCandidate;i++) {
+ List<CompoundEntry> results = confirmCNoun(candiList.get(i).toString());
+
+ int pos = 0;
+ int offset = 0;
+ for(CompoundEntry entry : results) {
+ pos += entry.getWord().length();
+ if(cnounMap.get(entry.getWord())!=null) continue;
+
+ // 한글과 매치되는 한자를 짤라서 큐에 저장한다.
+ morphQueue.add(new IndexWord(term.substring(offset,pos),offset));
+
+ cnounMap.put(entry.getWord(), entry.getWord());
+
+ if(entry.getWord().length()<2) continue; // 한글은 2글자 이상만 저장한다.
+
+ // 분리된 한글을 큐에 저장한다.
+ morphQueue.add(new IndexWord(entry.getWord(),offset));
+
+ offset = pos;
+ }
+ }
+ }
+
+ private List<CompoundEntry> confirmCNoun(String input) throws MorphException {
+
+ WordEntry cnoun = DictionaryUtil.getCNoun(input);
+ if(cnoun!=null && cnoun.getFeature(WordEntry.IDX_NOUN)=='2') {
+ return cnoun.getCompounds();
+ }
+
+ return cnAnalyzer.analyze(input);
+ }
+
+ private void analysisETC(String term) throws MorphException {
+
+ final char[] buffer = termAtt.buffer();
+ final int bufferLength = termAtt.length();
+ final String type = typeAtt.type();
+
+ if (type == APOSTROPHE_TYPE && // remove 's
+ bufferLength >= 2 &&
+ buffer[bufferLength-2] == '\'' &&
+ (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+ // Strip last 2 characters off
+ morphQueue.add(new IndexWord(term.substring(0,bufferLength - 2),0));
+ } else if (type == ACRONYM_TYPE) { // remove dots
+ int upto = 0;
+ for(int i=0;i<bufferLength;i++) {
+ char c = buffer[i];
+ if (c != '.')
+ buffer[upto++] = c;
+ }
+ morphQueue.add(new IndexWord(term.substring(0,upto),0));
+ } else {
+ morphQueue.add(new IndexWord(term,0));
+ }
+ }
+
+ private boolean isAlphaNumChar(int c) {
+ if((c>=48&&c<=57)||(c>=65&&c<=122)) return true;
+ return false;
+ }
+
+ public void setHasOrigin(boolean has) {
+ hasOrigin = has;
+ }
+
+ public void setExactMatch(boolean match) {
+ this.exactMatch = match;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilterFactory.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanFilterFactory.java Sun May 5 22:26:35 2013
@@ -24,13 +24,13 @@ import org.apache.lucene.analysis.util.T
public class KoreanFilterFactory extends TokenFilterFactory {
- private boolean bigrammable = true;
-
- private boolean hasOrigin = true;
-
- private boolean hasCNoun = true;
-
- private boolean exactMatch = false;
+ private boolean bigrammable = true;
+
+ private boolean hasOrigin = true;
+
+ private boolean hasCNoun = true;
+
+ private boolean exactMatch = false;
/**
* Initialize this factory via a set of key-value pairs.
@@ -42,33 +42,33 @@ public class KoreanFilterFactory extends
public void init(Map<String, String> args) {
-// bigrammable = getBoolean("bigrammable", true);
-// hasOrigin = getBoolean("hasOrigin", true);
-// exactMatch = getBoolean("exactMatch", false);
-// hasCNoun = getBoolean("hasCNoun", true);
- }
-
- public TokenStream create(TokenStream tokenstream) {
- return new KoreanFilter(tokenstream, bigrammable, hasOrigin, exactMatch, hasCNoun);
- }
-
- public void setBigrammable(boolean bool) {
- this.bigrammable = bool;
- }
-
- public void setHasOrigin(boolean bool) {
- this.hasOrigin = bool;
- }
-
- public void setHasCNoun(boolean bool) {
- this.hasCNoun = bool;
- }
-
- /**
- * determin whether the original compound noun is returned or not if a input word is analyzed morphically.
-// * @param has
- */
- public void setExactMatch(boolean bool) {
- exactMatch = bool;
- }
+// bigrammable = getBoolean("bigrammable", true);
+// hasOrigin = getBoolean("hasOrigin", true);
+// exactMatch = getBoolean("exactMatch", false);
+// hasCNoun = getBoolean("hasCNoun", true);
+ }
+
+ public TokenStream create(TokenStream tokenstream) {
+ return new KoreanFilter(tokenstream, bigrammable, hasOrigin, exactMatch, hasCNoun);
+ }
+
+ public void setBigrammable(boolean bool) {
+ this.bigrammable = bool;
+ }
+
+ public void setHasOrigin(boolean bool) {
+ this.hasOrigin = bool;
+ }
+
+ public void setHasCNoun(boolean bool) {
+ this.hasCNoun = bool;
+ }
+
+ /**
+ * determin whether the original compound noun is returned or not if a input word is analyzed morphically.
+ // * @param has
+ */
+ public void setExactMatch(boolean bool) {
+ exactMatch = bool;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java Sun May 5 22:26:35 2013
@@ -29,135 +29,133 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.util.Version;
public class KoreanTokenizer extends Tokenizer {
-
- /** A private instance of the JFlex-constructed scanner */
- private final KoreanTokenizerImpl scanner;
-
- public static final int ALPHANUM = 0;
- public static final int APOSTROPHE = 1;
- public static final int ACRONYM = 2;
- public static final int COMPANY = 3;
- public static final int EMAIL = 4;
- public static final int HOST = 5;
- public static final int NUM = 6;
- public static final int CJ = 7;
- public static final int ACRONYM_DEP = 8;
- public static final int KOREAN = 9;
- public static final int CHINESE = 10;
-
-
- /** String token types that correspond to token type int constants */
- public static final String [] TOKEN_TYPES = new String [] {
- "<ALPHANUM>",
- "<APOSTROPHE>",
- "<ACRONYM>",
- "<COMPANY>",
- "<EMAIL>",
- "<HOST>",
- "<NUM>",
- "<CJ>",
- "<ACRONYM_DEP>",
- "<KOREAN>",
- "<CHINESE>"
- };
-
- private boolean replaceInvalidAcronym;
-
- private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-
- /** Set the max allowed token length. Any token longer
- * than this is skipped. */
- public void setMaxTokenLength(int length) {
- this.maxTokenLength = length;
- }
-
- /** @see #setMaxTokenLength */
- public int getMaxTokenLength() {
- return maxTokenLength;
- }
-
- /**
- * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
- * the <code>input</code> to the newly created JFlex scanner.
- *
- * @param input The input reader
- *
- * See http://issues.apache.org/jira/browse/LUCENE-1068
- */
- public KoreanTokenizer(Version matchVersion, Reader input) {
- super(input);
- this.scanner = new KoreanTokenizerImpl(input);
- init(input, matchVersion);
- }
-
- /**
- * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
- */
- public KoreanTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
- super(factory, input);
- this.scanner = new KoreanTokenizerImpl(input);
- init(input, matchVersion);
- }
-
- private final void init(Reader input, Version matchVersion) {
- if (matchVersion.onOrAfter(Version.LUCENE_42)) {
- replaceInvalidAcronym = true;
- } else {
- replaceInvalidAcronym = false;
- }
- this.input = input;
- }
-
- // this tokenizer generates three attributes:
- // term offset, positionIncrement and type
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
- @Override
- public final boolean incrementToken() throws IOException {
- clearAttributes();
- int posIncr = 1;
-
- while(true) {
- int tokenType = scanner.getNextToken();
-
- if (tokenType == KoreanTokenizerImpl.YYEOF) {
- return false;
- }
-
- if (scanner.yylength() <= maxTokenLength) {
- posIncrAtt.setPositionIncrement(posIncr);
- scanner.getText(termAtt);
- final int start = scanner.yychar();
- offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
- typeAtt.setType(KoreanTokenizer.TOKEN_TYPES[tokenType]);
-
- return true;
- } else
- // When we skip a too-long term, we still increment the
- // position increment
- posIncr++;
- }
- }
-
- @Override
- public final void end() {
- // set final offset
- int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
- offsetAtt.setOffset(finalOffset, finalOffset);
- }
-
- @Override
- public void reset() throws IOException {
- scanner.yyreset(input);
- }
-
-
+
+ /** A private instance of the JFlex-constructed scanner */
+ private final KoreanTokenizerImpl scanner;
+
+ public static final int ALPHANUM = 0;
+ public static final int APOSTROPHE = 1;
+ public static final int ACRONYM = 2;
+ public static final int COMPANY = 3;
+ public static final int EMAIL = 4;
+ public static final int HOST = 5;
+ public static final int NUM = 6;
+ public static final int CJ = 7;
+ public static final int ACRONYM_DEP = 8;
+ public static final int KOREAN = 9;
+ public static final int CHINESE = 10;
+
+
+ /** String token types that correspond to token type int constants */
+ public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ "<ACRONYM_DEP>",
+ "<KOREAN>",
+ "<CHINESE>"
+ };
+
+ private boolean replaceInvalidAcronym;
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+ /** Set the max allowed token length. Any token longer
+ * than this is skipped. */
+ public void setMaxTokenLength(int length) {
+ this.maxTokenLength = length;
+ }
+
+ /** @see #setMaxTokenLength */
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ /**
+ * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
+ * the <code>input</code> to the newly created JFlex scanner.
+ *
+ * @param input The input reader
+ *
+ * See http://issues.apache.org/jira/browse/LUCENE-1068
+ */
+ public KoreanTokenizer(Version matchVersion, Reader input) {
+ super(input);
+ this.scanner = new KoreanTokenizerImpl(input);
+ init(input, matchVersion);
+ }
+
+ /**
+ * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
+ */
+ public KoreanTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
+ super(factory, input);
+ this.scanner = new KoreanTokenizerImpl(input);
+ init(input, matchVersion);
+ }
+
+ private final void init(Reader input, Version matchVersion) {
+ if (matchVersion.onOrAfter(Version.LUCENE_42)) {
+ replaceInvalidAcronym = true;
+ } else {
+ replaceInvalidAcronym = false;
+ }
+ this.input = input;
+ }
+
+ // this tokenizer generates three attributes:
+ // term offset, positionIncrement and type
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ @Override
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ int posIncr = 1;
+
+ while(true) {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == KoreanTokenizerImpl.YYEOF) {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength) {
+ posIncrAtt.setPositionIncrement(posIncr);
+ scanner.getText(termAtt);
+ final int start = scanner.yychar();
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+ typeAtt.setType(KoreanTokenizer.TOKEN_TYPES[tokenType]);
+
+ return true;
+ } else
+ // When we skip a too-long term, we still increment the
+ // position increment
+ posIncr++;
+ }
+ }
+
+ @Override
+ public final void end() {
+ // set final offset
+ int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ scanner.yyreset(input);
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerFactory.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerFactory.java Sun May 5 22:26:35 2013
@@ -27,7 +27,7 @@ import org.apache.lucene.util.Version;
public class KoreanTokenizerFactory extends TokenizerFactory {
- private Version version;
+ private Version version;
/**
* Initialize this factory via a set of key-value pairs.
@@ -41,17 +41,17 @@ public class KoreanTokenizerFactory exte
return null; //To change body of implemented methods use File | Settings | File Templates.
}
-// public KoreanTokenizerFactory() {
-// version = Version.LUCENE_42;
-// }
+// public KoreanTokenizerFactory() {
+// version = Version.LUCENE_42;
+// }
//
//
-// public KoreanTokenizerFactory(Version v) {
-// version = v;
-// }
-//
-// public Tokenizer create(Reader input) {
-// return new KoreanTokenizer(version, input);
-// }
+// public KoreanTokenizerFactory(Version v) {
+// version = v;
+// }
+//
+// public Tokenizer create(Reader input) {
+// return new KoreanTokenizer(version, input);
+// }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex Sun May 5 22:26:35 2013
@@ -61,9 +61,8 @@ public static final String [] TOKEN_TYPE
"<CHINESE>"
};
-public final int yychar()
-{
- return yychar;
+public final int yychar() {
+ return yychar;
}
/**
@@ -75,68 +74,68 @@ final void getText(CharTermAttribute t)
%}
// korean word: a sequence of digits & letters &
-KOREAN = ({LETTER}|{NUM}|{DIGIT})* {HANLETTER}+ ({LETTER}|{DIGIT})*
+KOREAN = ({LETTER}|{NUM}|{DIGIT})* {HANLETTER}+ ({LETTER}|{DIGIT})*
// basic word: a sequence of digits & letters
-ALPHANUM = ({LETTER}|{DIGIT})+
+ALPHANUM = ({LETTER}|{DIGIT})+
// chinese word: a sequence of digits & letters &
-CHINESE = {CHINESELETTER}+ ({LETTER}|{DIGIT})*
+CHINESE = {CHINESELETTER}+ ({LETTER}|{DIGIT})*
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possesives
-APOSTROPHE = {ALPHA} ("'" {ALPHA})+
+APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
-ACRONYM = {LETTER} "." ({LETTER} ".")+
+ACRONYM = {LETTER} "." ({LETTER} ".")+
-ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
+ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
// company names like AT&T and Excite@Home.
-COMPANY = {ALPHA} ("&"|"@") {ALPHA}
+COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// email addresses
-EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// hostname
-HOST = {ALPHANUM} ((".") {ALPHANUM})+
+HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
-NUM = ({ALPHANUM} {P} {HAS_DIGIT}
- | "." {DIGIT}
- | {DIGIT} "."
- | {HAS_DIGIT} {P} {ALPHANUM}
- | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
- | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
- | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
- | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+NUM = ({ALPHANUM} {P} {HAS_DIGIT}
+ | "." {DIGIT}
+ | {DIGIT} "."
+ | {HAS_DIGIT} {P} {ALPHANUM}
+ | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+ | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// punctuation
-P = ("_"|"-"|"/"|"."|",")
+P = ("_"|"-"|"/"|"."|",")
// at least one digit
-HAS_DIGIT =
+HAS_DIGIT =
({LETTER}|{DIGIT})*
{DIGIT}
({LETTER}|{DIGIT})*
-ALPHA = ({LETTER})+
+ALPHA = ({LETTER})+
-LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
-DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
HANLETTER = [\uac00-\ud7af\u1100-\u11ff]
-CHINESELETTER = [\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]
+CHINESELETTER = [\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]
// Chinese, Japanese
-CJ = ([\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\uff65-\uff9f])+
+CJ = ([\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\uff65-\uff9f])+
-WHITESPACE = \r\n | [ \r\n\t\f]
+WHITESPACE = \r\n | [ \r\n\t\f]
%%
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutput.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutput.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutput.java Sun May 5 22:26:35 2013
@@ -24,220 +24,219 @@ import org.apache.lucene.analysis.kr.uti
public class AnalysisOutput implements Cloneable {
- public static final int SCORE_CORRECT = 100;
- public static final int SCORE_COMPOUNDS = 70;
- public static final int SCORE_ANALYSIS = 30;
- public static final int SCORE_CANDIDATE = 10;
- public static final int SCORE_FAIL = 0;
-
- private String source; //분석하기 전 문자열(띄워쓰기 모드에서 사용된다.)
- private int score; // score of this result
- private int patn; // word pattern
- private char type; // type of input word
- private List<CompoundEntry> compound = new ArrayList<CompoundEntry>(); // compound noun of input word
- private String stem;
- private char pos; // 3 simplified stem type
- private char pos2; // pos attr. for 'pos'
- private char dinf; // pos info. in Han-dic
- private String nsfx; // index of noun suffix
- private String josa; // josa string
- private List<String> jlist = new ArrayList<String>(); // unit-josa sequence
- private String eomi; // Eomi string
- private List<String> elist = new ArrayList<String>(); // unit-Eomi sequence
- private String pomi; // prefinal Eomi
- private String xverb; // Xverb string
- private String vsfx; // verb suffix
- private char vtype; // irregular type
-
- public AnalysisOutput() {
- this.score = SCORE_FAIL;
- }
-
- public AnalysisOutput(String stem, String josa, String eomi, int patn) {
- this.score = SCORE_ANALYSIS;
- this.stem=stem;
- this.josa = josa;
- this.eomi = eomi;
- this.patn = patn;
- }
-
- public AnalysisOutput(String stem, String josa, String eomi, int patn, int score) {
- this(stem,josa,eomi,patn);
- this.score = score;
- }
-
- public AnalysisOutput(String stem, String josa, String eomi, char pos, int patn, int score) {
- this(stem,josa,eomi,patn,score);
- this.pos = pos;
- }
-
- public void setScore(int i) {
- this.score = i;
- }
- public void setPatn(int i) {
- this.patn = i;
- }
- public void setType(char c) {
- this.type = c;
- }
-
- public void setStem(String s) {
- this.stem = s;
- }
-
-
- public void setPos(char c) {
- this.pos = c;
- }
-
- public void setPos2(char c){
- this.pos2 = c;
- }
-
- public void setDinf(char c){
- this.dinf = c;
- }
-
- public void setNsfx(String s) {
- this.nsfx = s;
- }
-
- public void setJosa(String s) {
- this.josa = s;
- }
-
- public void addJlist(String l) {
- this.jlist.add(l);
- }
-
- public void setEomi(String s){
- this.eomi = s;
- }
-
- public void addElist(String l){
- this.elist.add(l);
- }
-
- public void setElist(String l, int index){
- this.elist.set(index,l);
- }
-
- public void setPomi(String s) {
- this.pomi = s;
- }
- public void setXverb(String s){
- this.xverb=s;
- }
- public void setVsfx(String s) {
- this.vsfx = s;
- }
- public void setVtype(char c) {
- this.vtype = c;
- }
-
- public int getScore() {
- return this.score;
- }
- public int getPatn() {
- return this.patn;
- }
-
- public char getType() {
- return this.type;
- }
- public String getStem() {
- return stem;
- }
- public char getPos() {
- return this.pos;
- }
- public char getPos2() {
- return this.pos2;
- }
- public char getDinf() {
- return this.dinf;
- }
- public String getNsfx() {
- return this.nsfx;
- }
- public String getJosa() {
- return this.josa;
- }
- public List<String> getJlist() {
- return this.jlist;
- }
- public String getEomi() {
- return this.eomi;
- }
- public List<String> getElist() {
- return this.elist;
- }
- public String getPomi(){
- return this.pomi;
- }
- public String getXverb() {
- return this.xverb;
- }
- public String getVsfx() {
- return this.vsfx;
- }
- public char getVtype() {
- return this.vtype;
- }
-
- public void addCNoun(CompoundEntry w) {
- compound.add(w);
- }
-
- public List<CompoundEntry> getCNounList() {
- return compound;
- }
-
- public void setCNoun(List cnoun) {
- compound = cnoun;
- }
-
- public void addCNoun(List cnoun) {
- compound.addAll(cnoun);
- }
-
- /**
- * @return the source
- */
- public String getSource() {
- return source;
- }
-
- /**
- * @param source the source to set
- */
- public void setSource(String source) {
- this.source = source;
- }
-
- public AnalysisOutput clone() throws CloneNotSupportedException {
- AnalysisOutput output = (AnalysisOutput)super.clone();
-
- output.setDinf(this.dinf);
- output.setEomi(this.eomi);
- output.setJosa(this.josa);
- output.setNsfx(this.nsfx);
- output.setPatn(this.patn);
- output.setPomi(this.pomi);
- output.setPos(this.pos);
- output.setPos2(this.pos2);
- output.setScore(this.score);
- output.setStem(this.stem);
- output.setType(this.type);
- output.setVsfx(this.vsfx);
- output.setVtype(this.vtype);
- output.setXverb(this.xverb);
-
- return output;
- }
-
- public String toString() {
- return Utilities.buildOutputString(this);
- }
-
+ public static final int SCORE_CORRECT = 100;
+ public static final int SCORE_COMPOUNDS = 70;
+ public static final int SCORE_ANALYSIS = 30;
+ public static final int SCORE_CANDIDATE = 10;
+ public static final int SCORE_FAIL = 0;
+
+ private String source; //분석하기 전 문자열(띄워쓰기 모드에서 사용된다.)
+ private int score; // score of this result
+ private int patn; // word pattern
+ private char type; // type of input word
+ private List<CompoundEntry> compound = new ArrayList<CompoundEntry>(); // compound noun of input word
+ private String stem;
+ private char pos; // 3 simplified stem type
+ private char pos2; // pos attr. for 'pos'
+ private char dinf; // pos info. in Han-dic
+ private String nsfx; // index of noun suffix
+ private String josa; // josa string
+ private List<String> jlist = new ArrayList<String>(); // unit-josa sequence
+ private String eomi; // Eomi string
+ private List<String> elist = new ArrayList<String>(); // unit-Eomi sequence
+ private String pomi; // prefinal Eomi
+ private String xverb; // Xverb string
+ private String vsfx; // verb suffix
+ private char vtype; // irregular type
+
+ public AnalysisOutput() {
+ this.score = SCORE_FAIL;
+ }
+
+ public AnalysisOutput(String stem, String josa, String eomi, int patn) {
+ this.score = SCORE_ANALYSIS;
+ this.stem=stem;
+ this.josa = josa;
+ this.eomi = eomi;
+ this.patn = patn;
+ }
+
+ public AnalysisOutput(String stem, String josa, String eomi, int patn, int score) {
+ this(stem,josa,eomi,patn);
+ this.score = score;
+ }
+
+ public AnalysisOutput(String stem, String josa, String eomi, char pos, int patn, int score) {
+ this(stem,josa,eomi,patn,score);
+ this.pos = pos;
+ }
+
+ public void setScore(int i) {
+ this.score = i;
+ }
+ public void setPatn(int i) {
+ this.patn = i;
+ }
+ public void setType(char c) {
+ this.type = c;
+ }
+
+ public void setStem(String s) {
+ this.stem = s;
+ }
+
+
+ public void setPos(char c) {
+ this.pos = c;
+ }
+
+ public void setPos2(char c){
+ this.pos2 = c;
+ }
+
+ public void setDinf(char c){
+ this.dinf = c;
+ }
+
+ public void setNsfx(String s) {
+ this.nsfx = s;
+ }
+
+ public void setJosa(String s) {
+ this.josa = s;
+ }
+
+ public void addJlist(String l) {
+ this.jlist.add(l);
+ }
+
+ public void setEomi(String s){
+ this.eomi = s;
+ }
+
+ public void addElist(String l){
+ this.elist.add(l);
+ }
+
+ public void setElist(String l, int index){
+ this.elist.set(index,l);
+ }
+
+ public void setPomi(String s) {
+ this.pomi = s;
+ }
+ public void setXverb(String s){
+ this.xverb=s;
+ }
+ public void setVsfx(String s) {
+ this.vsfx = s;
+ }
+ public void setVtype(char c) {
+ this.vtype = c;
+ }
+
+ public int getScore() {
+ return this.score;
+ }
+ public int getPatn() {
+ return this.patn;
+ }
+
+ public char getType() {
+ return this.type;
+ }
+ public String getStem() {
+ return stem;
+ }
+ public char getPos() {
+ return this.pos;
+ }
+ public char getPos2() {
+ return this.pos2;
+ }
+ public char getDinf() {
+ return this.dinf;
+ }
+ public String getNsfx() {
+ return this.nsfx;
+ }
+ public String getJosa() {
+ return this.josa;
+ }
+ public List<String> getJlist() {
+ return this.jlist;
+ }
+ public String getEomi() {
+ return this.eomi;
+ }
+ public List<String> getElist() {
+ return this.elist;
+ }
+ public String getPomi(){
+ return this.pomi;
+ }
+ public String getXverb() {
+ return this.xverb;
+ }
+ public String getVsfx() {
+ return this.vsfx;
+ }
+ public char getVtype() {
+ return this.vtype;
+ }
+
+ public void addCNoun(CompoundEntry w) {
+ compound.add(w);
+ }
+
+ public List<CompoundEntry> getCNounList() {
+ return compound;
+ }
+
+ public void setCNoun(List cnoun) {
+ compound = cnoun;
+ }
+
+ public void addCNoun(List cnoun) {
+ compound.addAll(cnoun);
+ }
+
+ /**
+ * @return the source
+ */
+ public String getSource() {
+ return source;
+ }
+
+ /**
+ * @param source the source to set
+ */
+ public void setSource(String source) {
+ this.source = source;
+ }
+
+ public AnalysisOutput clone() throws CloneNotSupportedException {
+ AnalysisOutput output = (AnalysisOutput)super.clone();
+
+ output.setDinf(this.dinf);
+ output.setEomi(this.eomi);
+ output.setJosa(this.josa);
+ output.setNsfx(this.nsfx);
+ output.setPatn(this.patn);
+ output.setPomi(this.pomi);
+ output.setPos(this.pos);
+ output.setPos2(this.pos2);
+ output.setScore(this.score);
+ output.setStem(this.stem);
+ output.setType(this.type);
+ output.setVsfx(this.vsfx);
+ output.setVtype(this.vtype);
+ output.setXverb(this.xverb);
+
+ return output;
+ }
+
+ public String toString() {
+ return Utilities.buildOutputString(this);
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java Sun May 5 22:26:35 2013
@@ -20,25 +20,25 @@ package org.apache.lucene.analysis.kr.mo
import java.util.Comparator;
public class AnalysisOutputComparator<T> implements Comparator<T> {
- public int compare(T o1, T o2) {
-
- AnalysisOutput out1 = (AnalysisOutput)o1;
- AnalysisOutput out2 = (AnalysisOutput)o2;
-
- int score = out2.getScore()-out1.getScore();
- int pattern = out2.getPatn()-out1.getPatn();
- int len = out1.getStem().length()-out2.getStem().length();
-
- if(score!=0) return score;
-
- if(out2.getScore()==AnalysisOutput.SCORE_CORRECT &&
- out1.getScore()==AnalysisOutput.SCORE_CORRECT) {
- pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern;
- pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 1 : pattern;
- }
-
- if(pattern!=0) return pattern;
-
- return len;
- }
+ public int compare(T o1, T o2) {
+
+ AnalysisOutput out1 = (AnalysisOutput)o1;
+ AnalysisOutput out2 = (AnalysisOutput)o2;
+
+ int score = out2.getScore()-out1.getScore();
+ int pattern = out2.getPatn()-out1.getPatn();
+ int len = out1.getStem().length()-out2.getStem().length();
+
+ if(score!=0) return score;
+
+ if(out2.getScore()==AnalysisOutput.SCORE_CORRECT &&
+ out1.getScore()==AnalysisOutput.SCORE_CORRECT) {
+ pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern;
+ pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 1 : pattern;
+ }
+
+ if(pattern!=0) return pattern;
+
+ return len;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java?rev=1479410&r1=1479409&r2=1479410&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java Sun May 5 22:26:35 2013
@@ -21,67 +21,67 @@ package org.apache.lucene.analysis.kr.mo
복합명사의 개별단어에 대한 정보를 담고있는 클래스
*/
public class CompoundEntry {
-
- private String word;
-
- private int offset = -1;
-
- private boolean exist = true;
-
- private char pos = PatternConstants.POS_NOUN;
-
- public CompoundEntry() {
-
- }
-
- public CompoundEntry(String w) {
- this.word = w;
- }
-
- public CompoundEntry(String w,int o) {
- this(w);
- this.offset = o;
- }
-
- public CompoundEntry(String w,int o, boolean is) {
- this(w,o);
- this.exist = is;
- }
-
- public CompoundEntry(String w,int o, boolean is, char p) {
- this(w,o,is);
- this.pos = p;
- }
-
- public void setWord(String w) {
- this.word = w;
- }
-
- public void setOffset(int o) {
- this.offset = o;
- }
-
- public String getWord() {
- return this.word;
- }
-
- public int getOffset() {
- return this.offset;
- }
-
- public boolean isExist() {
- return exist;
- }
-
- public void setExist(boolean is) {
- this.exist = is;
- }
-
- public char getPos() {
- return pos;
- }
+
+ private String word;
+
+ private int offset = -1;
+
+ private boolean exist = true;
+
+ private char pos = PatternConstants.POS_NOUN;
+
+ public CompoundEntry() {
+
+ }
+
+ public CompoundEntry(String w) {
+ this.word = w;
+ }
+
+ public CompoundEntry(String w,int o) {
+ this(w);
+ this.offset = o;
+ }
+
+ public CompoundEntry(String w,int o, boolean is) {
+ this(w,o);
+ this.exist = is;
+ }
+
+ public CompoundEntry(String w,int o, boolean is, char p) {
+ this(w,o,is);
+ this.pos = p;
+ }
+
+ public void setWord(String w) {
+ this.word = w;
+ }
+
+ public void setOffset(int o) {
+ this.offset = o;
+ }
+
+ public String getWord() {
+ return this.word;
+ }
+
+ public int getOffset() {
+ return this.offset;
+ }
+
+ public boolean isExist() {
+ return exist;
+ }
+
+ public void setExist(boolean is) {
+ this.exist = is;
+ }
+
+ public char getPos() {
+ return pos;
+ }
- public void setPos(char pos) {
- this.pos = pos;
- }
+ public void setPos(char pos) {
+ this.pos = pos;
+ }
}