You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/20 01:25:55 UTC

svn commit: r1533842 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/morph/ resources/org/apache/lucene/analysis/ko/

Author: uschindler
Date: Sat Oct 19 23:25:54 2013
New Revision: 1533842

URL: http://svn.apache.org/r1533842
Log:
LUCENE-4956: Fix stopwords file, Cleanup analyzer (load stopwords file, no hardcoded stops), and filter (fix broken incrementToken, implement reset), remove unused variables in CompoundNounAnalyzer

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java Sat Oct 19 23:25:54 2013
@@ -20,22 +20,16 @@ package org.apache.lucene.analysis.ko;
 import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Set;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.util.Version;
 
 /**
- * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
- * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
+ * A Korean Analyzer
  */
 public class KoreanAnalyzer extends StopwordAnalyzerBase {
   
@@ -44,42 +38,23 @@ public class KoreanAnalyzer extends Stop
 
   private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
 
-  /**
-   * Specifies whether deprecated acronyms should be replaced with HOST type.
-   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
-   */
-  private final boolean replaceInvalidAcronym;
-    
-  private Set<String> stopSet;
-    
   private boolean bigrammable = false;
     
   private boolean hasOrigin = false;
     
   private boolean exactMatch = false;
-    
   private boolean originCNoun = true;
   
   private boolean isPositionInc = true;
   
-  public static final String DIC_ENCODING = "UTF-8";
-
-  /** An unmodifiable set containing some common English words that are usually not
-   useful for searching. */
+  /** An unmodifiable set containing some common words that are usually not useful for searching. */
   public static final CharArraySet STOP_WORDS_SET; 
-    
-
   static {
-    List<String> stopWords = Arrays.asList(new String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", 
-        "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", 
-        "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
-        "이","그","저","것","수","등","들","및","에서","그리고","그래서","또","또는"}
-    );
-    
-    CharArraySet stopSet = new CharArraySet(Version.LUCENE_42, stopWords.size(), false);
-   
-    stopSet.addAll(stopWords);
-    STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+    try {
+      STOP_WORDS_SET = loadStopwordSet(false, KoreanAnalyzer.class, "stopwords.txt", "#");
+    } catch (IOException ioe) {
+      throw new Error("Cannot load stop words", ioe);
+    }
   }
     
   public KoreanAnalyzer() {
@@ -90,7 +65,7 @@ public class KoreanAnalyzer extends Stop
    * 검색을 위한 형태소분석
    */
   public KoreanAnalyzer(boolean exactMatch) {
-    this(Version.LUCENE_42, STOP_WORDS_SET);      
+    this(Version.LUCENE_42, STOP_WORDS_SET);
     this.exactMatch = exactMatch;
   }
   
@@ -116,10 +91,8 @@ public class KoreanAnalyzer extends Stop
 
   public KoreanAnalyzer(Version matchVersion, CharArraySet stopWords) {
     super(matchVersion, stopWords); 
-    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_42);     
   }
   
-  
   @Override
   protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
     final KoreanTokenizer src = new KoreanTokenizer(matchVersion, reader);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Sat Oct 19 23:25:54 2013
@@ -37,62 +37,37 @@ import org.apache.lucene.analysis.ko.mor
 import org.apache.lucene.analysis.ko.morph.PatternConstants;
 import org.apache.lucene.analysis.ko.morph.WordEntry;
 import org.apache.lucene.analysis.ko.morph.WordSpaceAnalyzer;
-import org.apache.lucene.analysis.standard.ClassicTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 public class KoreanFilter extends TokenFilter {
 
-  private LinkedList<IndexWord> morphQueue;
-  
-  private MorphAnalyzer morph;
-  
-  private WordSpaceAnalyzer wsAnal;
-  
-  private boolean bigrammable = true;
-  
-  private boolean hasOrigin = false;
-  
-  private boolean originCNoun = true;
-  
-  private boolean exactMatch = false;
-  
-  private boolean isPositionInc = true;
-  
-  private char[] curTermBuffer;
-    
-  private int curTermLength;
-    
-  private String curType;
-    
-  private String curSource;
+  private final LinkedList<IndexWord> morphQueue = new LinkedList<IndexWord>();;
+  private final MorphAnalyzer morph = new MorphAnalyzer();
+  private final WordSpaceAnalyzer wsAnal = new WordSpaceAnalyzer();
+  private final CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
+  
+  private State currentState = null;
+  
+  private final boolean bigrammable;
+  private final boolean hasOrigin;
+  private final boolean originCNoun;
+  private final boolean isPositionInc;
     
-  private int tokStart;
-    
-  private int hanStart = 0; // 한글의 시작 위치, 복합명사일경우
-    
-  private int chStart = 0;
-    
-  private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
-  
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
     
-  private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
-  private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+  private static final String APOSTROPHE_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.APOSTROPHE];
+  private static final String ACRONYM_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.ACRONYM];
+  private static final String KOREAN_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN];
+  private static final String CHINESE_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE];
     
   public KoreanFilter(TokenStream input) {
-    super(input);
-    morphQueue =  new LinkedList<IndexWord>();
-    morph = new MorphAnalyzer();
-    wsAnal = new WordSpaceAnalyzer();
-    cnAnalyzer.setExactMach(false);
+    this(input, true);
   }
 
   /**
@@ -101,76 +76,70 @@ public class KoreanFilter extends TokenF
    * @param bigram  Whether the bigram index term return or not.
    */
   public KoreanFilter(TokenStream input, boolean bigram) {
-    this(input);  
-    bigrammable = bigram;
+    this(input, bigram, false);
   }
   
   public KoreanFilter(TokenStream input, boolean bigram, boolean has) {
-    this(input, bigram);
-    hasOrigin = has;
+    this(input, bigram, has, false);
   }
   
-  public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match) {
-    this(input, bigram,has);
-    this.exactMatch = match;
+  public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch) {
+    this(input, bigram, has, exactMatch, true);
   }
-  
-  public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun) {
-    this(input, bigram,has, match);
-    this.originCNoun = cnoun;
+
+  public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch, boolean cnoun) {
+    this(input, bigram, has, exactMatch, cnoun, true);
   }
 
-  public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun, boolean isPositionInc) {
-    this(input, bigram,has, match, cnoun);
+  public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean exactMatch, boolean cnoun, boolean isPositionInc) {
+    super(input);
+    cnAnalyzer.setExactMach(exactMatch);
+    this.bigrammable = bigram;
+    this.hasOrigin = has;
+    this.originCNoun = cnoun;
     this.isPositionInc = isPositionInc;
   }
   
   public final boolean incrementToken() throws IOException {
-
-    if(curTermBuffer!=null&&morphQueue.size()>0) {
-      setTermBufferByQueue(false);
+    if (!morphQueue.isEmpty()) {
+      restoreState(currentState);
+      setTermBufferByQueue();
       return true;
     }
 
-    if(!input.incrementToken()) return false;
-    
-    curTermBuffer = termAtt.buffer().clone();
-    curTermLength = termAtt.length();
-    tokStart = offsetAtt.startOffset();    
-    curType = typeAtt.type();
- 
-    if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType)) {            
-      analysisKorean(new String(curTermBuffer,0,termAtt.length()));
-    } else if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(curType)) {
-      analysisChinese(new String(curTermBuffer,0,termAtt.length()));
-    } else {
-      analysisETC(new String(curTermBuffer,0,termAtt.length()));
-    }        
-
-    if(morphQueue!=null&&morphQueue.size()>0) {
-      setTermBufferByQueue(true);  
-    } else {
-      return incrementToken();
+    while (input.incrementToken()) {
+      currentState = captureState();
+      
+      final String type = typeAtt.type();
+      if(KOREAN_TYPE.equals(type)) {            
+        analysisKorean(termAtt.toString());
+      } else if(CHINESE_TYPE.equals(type)) {
+        analysisChinese(termAtt.toString());
+      } else {
+        analysisETC(termAtt.toString());
+      }        
+  
+      if (!morphQueue.isEmpty()) {
+        // no need to restore state!
+        setTermBufferByQueue();
+        return true;
+      }
     }
 
-    return true;
-
+    return false;
   }
   
   /**
    * queue에 저장된 값으로 buffer의 값을 복사한다.
    */
-  private void setTermBufferByQueue(boolean isFirst) {
-    
-    clearAttributes();
-        
+  private void setTermBufferByQueue() {
     IndexWord iw = morphQueue.removeFirst();
-
-    termAtt.copyBuffer(iw.getWord().toCharArray(), 0, iw.getWord().length());
-    offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + iw.getWord().length());
+    String word = iw.getWord();
     
-    int inc = isPositionInc ?  iw.getIncrement() : 0;
+    termAtt.setEmpty().append(word);
+    offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + word.length());
     
+    int inc = isPositionInc ?  iw.getIncrement() : 0;
     posIncrAtt.setPositionIncrement(inc);      
     
   }
@@ -206,7 +175,8 @@ public class KoreanFilter extends TokenF
           results.addAll(outputs);
         }
         extractKeyword(results, offsetAtt.startOffset(), map, 0);
-      }catch(Exception e) {
+      } catch(Exception e) {
+        // nocommit: Fix this stupidness with catch all Exceptions!
         extractKeyword(outputs.subList(0, 1), offsetAtt.startOffset(), map, 0);
       }
       
@@ -221,9 +191,7 @@ public class KoreanFilter extends TokenF
   
   }
   
-  private void extractKeyword(List<AnalysisOutput> outputs, int startoffset, Map<String,IndexWord> map, int position) 
-      
-  {
+  private void extractKeyword(List<AnalysisOutput> outputs, int startoffset, Map<String,IndexWord> map, int position) {
 
     int maxDecompounds = 0;
     int maxStem = 0;
@@ -450,20 +418,11 @@ public class KoreanFilter extends TokenF
     return false;
   }
   
-  public void setHasOrigin(boolean has) {
-    hasOrigin = has;
-  }
-
-  public void setExactMatch(boolean match) {
-    this.exactMatch = match;
-  }
-
-  /* nocommit: i think this is needed? @Override
+  @Override
   public void reset() throws IOException {
     super.reset();
     morphQueue.clear();
-    curTermBuffer = null;
-  }*/
-  
+    currentState = null;
+  }
   
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilterFactory.java Sat Oct 19 23:25:54 2013
@@ -52,25 +52,21 @@ public class KoreanFilterFactory extends
   
   private static final String IS_POSITION_INC = "incPosition";
 
-  private boolean bigrammable;
+  private final boolean bigrammable;
 
-  private boolean hasOrigin;
+  private final boolean hasOrigin;
 
-  private boolean hasCNoun;
+  private final boolean hasCNoun;
 
-  private boolean exactMatch;
+  private final boolean exactMatch;
   
-  private boolean isPositionInc;
+  private final boolean isPositionInc;
 
   /**
    * Initialize this factory via a set of key-value pairs.
    */
   public KoreanFilterFactory(Map<String, String> args) {
     super(args);
-    init(args);
-  }
-
-  public void init(Map<String, String> args) {
     bigrammable = getBoolean(args, BIGRAMMABLE_PARAM, true);
     hasOrigin = getBoolean(args, HAS_ORIGIN_PARAM, true);
     exactMatch = getBoolean(args, EXACT_MATCH_PARAM, false);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Sat Oct 19 23:25:54 2013
@@ -233,7 +233,6 @@ public class CompoundNounAnalyzer {
       return true;
     }
     
-    int score = 0;
     List<CompoundEntry> results = new ArrayList<CompoundEntry>();
         
     String prev = input.substring(0,pos);
@@ -241,9 +240,6 @@ public class CompoundNounAnalyzer {
     
     boolean pSucess = false;
     boolean rSuccess = false;
-    CompoundEntry pEntry = null;
-    CompoundEntry rEntry = null;
-    
     WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);
     if(prvEntry==null) {
       pSucess = analyze(prev, results, false);
@@ -370,9 +366,7 @@ public class CompoundNounAnalyzer {
    */
   private CompoundEntry analyzeSingle(String input) {
             
-    boolean success = false;
     int score = AnalysisOutput.SCORE_ANALYSIS;
-    int ptn = PatternConstants.PTN_N;
     char pos = PatternConstants.POS_NOUN;
     if(input.length()==1) return  new CompoundEntry(input, 0, true,pos);
     
@@ -380,7 +374,6 @@ public class CompoundNounAnalyzer {
     if(entry!=null) {
       score = AnalysisOutput.SCORE_CORRECT;
       if(entry.getFeature(WordEntry.IDX_NOUN)!='1') {
-        ptn = PatternConstants.PTN_AID;
         pos = PatternConstants.POS_AID;
       }
     }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt?rev=1533842&r1=1533841&r2=1533842&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/stopwords.txt Sat Oct 19 23:25:54 2013
@@ -6,7 +6,6 @@
 #
 # This stopwords file has the same default set as KoreanAnalyzer
 #
-
 a
 an
 and
@@ -53,5 +52,3 @@ with
 그래서
 또
 또는
-
-##### End of file