You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/20 01:25:55 UTC
svn commit: r1533842 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/morph/
resources/org/apache/lucene/analysis/ko/
Author: uschindler
Date: Sat Oct 19 23:25:54 2013
New Revision: 1533842
URL: http://svn.apache.org/r1533842
Log:
LUCENE-4956: Fix stopwords file, Cleanup analyzer (load stopwords file, no hardcoded stops), and filter (fix broken incrementToken, implement reset), remove unused variables in CompoundNounAnalyzer
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java Sat Oct 19 23:25:54 2013
@@ -20,22 +20,16 @@ package org.apache.lucene.analysis.ko;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/**
- * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
- * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
+ * A Korean Analyzer
*/
public class KoreanAnalyzer extends StopwordAnalyzerBase {
@@ -44,42 +38,23 @@ public class KoreanAnalyzer extends Stop
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
- /**
- * Specifies whether deprecated acronyms should be replaced with HOST type.
- * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
- */
- private final boolean replaceInvalidAcronym;
-
- private Set<String> stopSet;
-
private boolean bigrammable = false;
private boolean hasOrigin = false;
private boolean exactMatch = false;
-
private boolean originCNoun = true;
private boolean isPositionInc = true;
- public static final String DIC_ENCODING = "UTF-8";
-
- /** An unmodifiable set containing some common English words that are usually not
- useful for searching. */
+ /** An unmodifiable set containing some common words that are usually not useful for searching. */
public static final CharArraySet STOP_WORDS_SET;
-
-
static {
- List<String> stopWords = Arrays.asList(new String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
- "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
- "이","그","저","것","수","등","들","및","에서","그리고","그래서","또","또는"}
- );
-
- CharArraySet stopSet = new CharArraySet(Version.LUCENE_42, stopWords.size(), false);
-
- stopSet.addAll(stopWords);
- STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+ try {
+ STOP_WORDS_SET = loadStopwordSet(false, KoreanAnalyzer.class, "stopwords.txt", "#");
+ } catch (IOException ioe) {
+ throw new Error("Cannot load stop words", ioe);
+ }
}
public KoreanAnalyzer() {
@@ -90,7 +65,7 @@ public class KoreanAnalyzer extends Stop
* 검색을 위한 형태소분석
*/
public KoreanAnalyzer(boolean exactMatch) {
- this(Version.LUCENE_42, STOP_WORDS_SET);
+ this(Version.LUCENE_42, STOP_WORDS_SET);
this.exactMatch = exactMatch;
}
@@ -116,10 +91,8 @@ public class KoreanAnalyzer extends Stop
public KoreanAnalyzer(Version matchVersion, CharArraySet stopWords) {
super(matchVersion, stopWords);
- replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_42);
}
-
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
final KoreanTokenizer src = new KoreanTokenizer(matchVersion, reader);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Sat Oct 19 23:25:54 2013
@@ -37,62 +37,37 @@ import org.apache.lucene.analysis.ko.mor
import org.apache.lucene.analysis.ko.morph.PatternConstants;
import org.apache.lucene.analysis.ko.morph.WordEntry;
import org.apache.lucene.analysis.ko.morph.WordSpaceAnalyzer;
-import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class KoreanFilter extends TokenFilter {
- private LinkedList<IndexWord> morphQueue;
-
- private MorphAnalyzer morph;
-
- private WordSpaceAnalyzer wsAnal;
-
- private boolean bigrammable = true;
-
- private boolean hasOrigin = false;
-
- private boolean originCNoun = true;
-
- private boolean exactMatch = false;
-
- private boolean isPositionInc = true;
-
- private char[] curTermBuffer;
-
- private int curTermLength;
-
- private String curType;
-
- private String curSource;
+ private final LinkedList<IndexWord> morphQueue = new LinkedList<IndexWord>();;
+ private final MorphAnalyzer morph = new MorphAnalyzer();
+ private final WordSpaceAnalyzer wsAnal = new WordSpaceAnalyzer();
+ private final CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
+
+ private State currentState = null;
+
+ private final boolean bigrammable;
+ private final boolean hasOrigin;
+ private final boolean originCNoun;
+ private final boolean isPositionInc;
- private int tokStart;
-
- private int hanStart = 0; // 한글의 시작 위치, 복합명사일경우
-
- private int chStart = 0;
-
- private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
-
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
- private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+ private static final String APOSTROPHE_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.APOSTROPHE];
+ private static final String ACRONYM_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.ACRONYM];
+ private static final String KOREAN_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN];
+ private static final String CHINESE_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE];
public KoreanFilter(TokenStream input) {
- super(input);
- morphQueue = new LinkedList<IndexWord>();
- morph = new MorphAnalyzer();
- wsAnal = new WordSpaceAnalyzer();
- cnAnalyzer.setExactMach(false);
+ this(input, true);
}
/**
@@ -101,76 +76,70 @@ public class KoreanFilter extends TokenF
* @param bigram Whether the bigram index term return or not.
*/
public KoreanFilter(TokenStream input, boolean bigram) {
- this(input);
- bigrammable = bigram;
+ this(input, bigram, false);
}
public KoreanFilter(TokenStream input, boolean bigram, boolean has) {
- this(input, bigram);
- hasOrigin = has;
+ this(input, bigram, has, false);
}
- public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match) {
- this(input, bigram,has);
- this.exactMatch = match;
+ public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch) {
+ this(input, bigram, has, exactMatch, true);
}
-
- public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun) {
- this(input, bigram,has, match);
- this.originCNoun = cnoun;
+
+ public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch, boolean cnoun) {
+ this(input, bigram, has, exactMatch, cnoun, true);
}
- public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun, boolean isPositionInc) {
- this(input, bigram,has, match, cnoun);
+ public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch, boolean cnoun, boolean isPositionInc) {
+ super(input);
+ cnAnalyzer.setExactMach(exactMatch);
+ this.bigrammable = bigram;
+ this.hasOrigin = has;
+ this.originCNoun = cnoun;
this.isPositionInc = isPositionInc;
}
public final boolean incrementToken() throws IOException {
-
- if(curTermBuffer!=null&&morphQueue.size()>0) {
- setTermBufferByQueue(false);
+ if (!morphQueue.isEmpty()) {
+ restoreState(currentState);
+ setTermBufferByQueue();
return true;
}
- if(!input.incrementToken()) return false;
-
- curTermBuffer = termAtt.buffer().clone();
- curTermLength = termAtt.length();
- tokStart = offsetAtt.startOffset();
- curType = typeAtt.type();
-
- if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType)) {
- analysisKorean(new String(curTermBuffer,0,termAtt.length()));
- } else if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(curType)) {
- analysisChinese(new String(curTermBuffer,0,termAtt.length()));
- } else {
- analysisETC(new String(curTermBuffer,0,termAtt.length()));
- }
-
- if(morphQueue!=null&&morphQueue.size()>0) {
- setTermBufferByQueue(true);
- } else {
- return incrementToken();
+ while (input.incrementToken()) {
+ currentState = captureState();
+
+ final String type = typeAtt.type();
+ if(KOREAN_TYPE.equals(type)) {
+ analysisKorean(termAtt.toString());
+ } else if(CHINESE_TYPE.equals(type)) {
+ analysisChinese(termAtt.toString());
+ } else {
+ analysisETC(termAtt.toString());
+ }
+
+ if (!morphQueue.isEmpty()) {
+ // no need to restore state!
+ setTermBufferByQueue();
+ return true;
+ }
}
- return true;
-
+ return false;
}
/**
* queueì ì ì¥ë ê°ì¼ë¡ bufferì ê°ì ë³µì¬íë¤.
*/
- private void setTermBufferByQueue(boolean isFirst) {
-
- clearAttributes();
-
+ private void setTermBufferByQueue() {
IndexWord iw = morphQueue.removeFirst();
-
- termAtt.copyBuffer(iw.getWord().toCharArray(), 0, iw.getWord().length());
- offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + iw.getWord().length());
+ String word = iw.getWord();
- int inc = isPositionInc ? iw.getIncrement() : 0;
+ termAtt.setEmpty().append(word);
+ offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + word.length());
+ int inc = isPositionInc ? iw.getIncrement() : 0;
posIncrAtt.setPositionIncrement(inc);
}
@@ -206,7 +175,8 @@ public class KoreanFilter extends TokenF
results.addAll(outputs);
}
extractKeyword(results, offsetAtt.startOffset(), map, 0);
- }catch(Exception e) {
+ } catch(Exception e) {
+ // nocommit: Fix this stupidness with catch all Exceptions!
extractKeyword(outputs.subList(0, 1), offsetAtt.startOffset(), map, 0);
}
@@ -221,9 +191,7 @@ public class KoreanFilter extends TokenF
}
- private void extractKeyword(List<AnalysisOutput> outputs, int startoffset, Map<String,IndexWord> map, int position)
-
- {
+ private void extractKeyword(List<AnalysisOutput> outputs, int startoffset, Map<String,IndexWord> map, int position) {
int maxDecompounds = 0;
int maxStem = 0;
@@ -450,20 +418,11 @@ public class KoreanFilter extends TokenF
return false;
}
- public void setHasOrigin(boolean has) {
- hasOrigin = has;
- }
-
- public void setExactMatch(boolean match) {
- this.exactMatch = match;
- }
-
- /* nocommit: i think this is needed? @Override
+ @Override
public void reset() throws IOException {
super.reset();
morphQueue.clear();
- curTermBuffer = null;
- }*/
-
+ currentState = null;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java Sat Oct 19 23:25:54 2013
@@ -52,25 +52,21 @@ public class KoreanFilterFactory extends
private static final String IS_POSITION_INC = "incPosition";
- private boolean bigrammable;
+ private final boolean bigrammable;
- private boolean hasOrigin;
+ private final boolean hasOrigin;
- private boolean hasCNoun;
+ private final boolean hasCNoun;
- private boolean exactMatch;
+ private final boolean exactMatch;
- private boolean isPositionInc;
+ private final boolean isPositionInc;
/**
* Initialize this factory via a set of key-value pairs.
*/
public KoreanFilterFactory(Map<String, String> args) {
super(args);
- init(args);
- }
-
- public void init(Map<String, String> args) {
bigrammable = getBoolean(args, BIGRAMMABLE_PARAM, true);
hasOrigin = getBoolean(args, HAS_ORIGIN_PARAM, true);
exactMatch = getBoolean(args, EXACT_MATCH_PARAM, false);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Sat Oct 19 23:25:54 2013
@@ -233,7 +233,6 @@ public class CompoundNounAnalyzer {
return true;
}
- int score = 0;
List<CompoundEntry> results = new ArrayList<CompoundEntry>();
String prev = input.substring(0,pos);
@@ -241,9 +240,6 @@ public class CompoundNounAnalyzer {
boolean pSucess = false;
boolean rSuccess = false;
- CompoundEntry pEntry = null;
- CompoundEntry rEntry = null;
-
WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);
if(prvEntry==null) {
pSucess = analyze(prev, results, false);
@@ -370,9 +366,7 @@ public class CompoundNounAnalyzer {
*/
private CompoundEntry analyzeSingle(String input) {
- boolean success = false;
int score = AnalysisOutput.SCORE_ANALYSIS;
- int ptn = PatternConstants.PTN_N;
char pos = PatternConstants.POS_NOUN;
if(input.length()==1) return new CompoundEntry(input, 0, true,pos);
@@ -380,7 +374,6 @@ public class CompoundNounAnalyzer {
if(entry!=null) {
score = AnalysisOutput.SCORE_CORRECT;
if(entry.getFeature(WordEntry.IDX_NOUN)!='1') {
- ptn = PatternConstants.PTN_AID;
pos = PatternConstants.POS_AID;
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt Sat Oct 19 23:25:54 2013
@@ -6,7 +6,6 @@
#
# This stopwords file has the same default set as KoreanAnalyzer
#
-
a
an
and
@@ -53,5 +52,3 @@ with
그래서
또
또는
-
-##### End of file