You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/22 05:38:43 UTC
svn commit: r1534473 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/ test/org/apache/lucene/analysis/ko/
Author: rmuir
Date: Tue Oct 22 03:38:40 2013
New Revision: 1534473
URL: http://svn.apache.org/r1534473
Log:
LUCENE-4956: pull out the broken acronym/etc. handling; users can just use ClassicFilter for that
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/wiki_results.txt
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java?rev=1534473&r1=1534472&r2=1534473&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java Tue Oct 22 03:38:40 2013
@@ -24,6 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
@@ -96,6 +97,7 @@ public class KoreanAnalyzer extends Stop
final KoreanTokenizer src = new KoreanTokenizer(reader);
src.setMaxTokenLength(maxTokenLength);
TokenStream tok = new LowerCaseFilter(matchVersion, src);
+ tok = new ClassicFilter(tok);
tok = new KoreanFilter(tok, bigrammable, hasOrigin, exactMatch, originCNoun);
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1534473&r1=1534472&r2=1534473&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Tue Oct 22 03:38:40 2013
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ko;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
@@ -60,8 +59,6 @@ public final class KoreanFilter extends
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private static final String APOSTROPHE_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.APOSTROPHE];
- private static final String ACRONYM_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.ACRONYM];
private static final String KOREAN_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN];
private static final String CHINESE_TYPE = KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE];
@@ -102,15 +99,15 @@ public final class KoreanFilter extends
}
while (input.incrementToken()) {
- currentState = captureState();
-
final String type = typeAtt.type();
- if(KOREAN_TYPE.equals(type)) {
+ if (KOREAN_TYPE.equals(type)) {
+ currentState = captureState();
analysisKorean(termAtt.toString());
- } else if(CHINESE_TYPE.equals(type)) {
+ } else if (CHINESE_TYPE.equals(type)) {
+ currentState = captureState();
analysisChinese(termAtt.toString());
} else {
- analysisETC(termAtt.toString());
+ return true; // pass anything else thru
}
if (!morphQueue.isEmpty()) {
@@ -372,31 +369,6 @@ public final class KoreanFilter extends
return cnAnalyzer.analyze(input);
}
- private void analysisETC(String term) {
-
- final char[] buffer = termAtt.buffer();
- final int bufferLength = termAtt.length();
- final String type = typeAtt.type();
-
- if (type == APOSTROPHE_TYPE && // remove 's
- bufferLength >= 2 &&
- buffer[bufferLength-2] == '\'' &&
- (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
- // Strip last 2 characters off
- morphQueue.add(new Token(term.substring(0,bufferLength - 2),0));
- } else if (type == ACRONYM_TYPE) { // remove dots
- int upto = 0;
- for(int i=0;i<bufferLength;i++) {
- char c = buffer[i];
- if (c != '.')
- buffer[upto++] = c;
- }
- morphQueue.add(new Token(term.substring(0,upto),0));
- } else {
- morphQueue.add(new Token(term,0));
- }
- }
-
private boolean isAlphaNumChar(int c) {
if((c>=48&&c<=57)||(c>=65&&c<=122)) return true;
return false;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java?rev=1534473&r1=1534472&r2=1534473&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java Tue Oct 22 03:38:40 2013
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -47,15 +48,15 @@ public final class KoreanTokenizer exten
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
- "<ALPHANUM>",
- "<APOSTROPHE>",
- "<ACRONYM>",
- "<COMPANY>",
- "<EMAIL>",
- "<HOST>",
- "<NUM>",
- "<CJ>",
- "<ACRONYM_DEP>",
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ALPHANUM],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.COMPANY],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.EMAIL],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.NUM],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.CJ],
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM_DEP],
"<KOREAN>",
"<CHINESE>"
};
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java?rev=1534473&r1=1534472&r2=1534473&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java Tue Oct 22 03:38:40 2013
@@ -40,6 +40,13 @@ public class TestKoreanAnalyzer extends
new int[]{1, 1, 1, 1, 1, 1, 1}
);
}
+
+ // Unclear why acronym handling is needed here, but "Washington D.C." should at least yield "washington dc", not "washington d".
+ public void testAcronym() throws IOException {
+ assertAnalyzesTo(new KoreanAnalyzer(TEST_VERSION_CURRENT), "Washington D.C.",
+ new String[] { "washington", "dc" }
+ );
+ }
public void testCompoundNoun() throws IOException {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/wiki_results.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/wiki_results.txt?rev=1534473&r1=1534472&r2=1534473&view=diff
==============================================================================
Binary files - no diff available.