You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/04 14:34:14 UTC
svn commit: r1296805 [3/3] - in /lucene/dev/trunk: lucene/contrib/
lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/
lucene/core/src/java/org/apache/lucene/util/
lucene/core/src/java/org/apache/lucene/util/fst/
lucene/core/src/test/org/a...
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1296805&r1=1296804&r2=1296805&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Sun Mar 4 13:34:13 2012
@@ -23,29 +23,17 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class UserDictionaryTest extends LuceneTestCase {
- private UserDictionary readDict() throws IOException {
- InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
- if (is == null)
- throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
- try {
- Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
- return new UserDictionary(reader);
- } finally {
- is.close();
- }
- }
-
@Test
public void testLookup() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
String s = "関西国際空港に行った";
int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@ public class UserDictionaryTest extends
@Test
public void testReadings() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@ public class UserDictionaryTest extends
@Test
public void testPartOfSpeech() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@ public class UserDictionaryTest extends
@Test
public void testRead() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
assertNotNull(dictionary);
}
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt?rev=1296805&r1=1296804&r2=1296805&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt Sun Mar 4 13:34:13 2012
@@ -25,43 +25,45 @@
##
# Kansai International Airport
-関西国際空港 関西 国際 空港
+関西国際空港 関西 関西国際空港/0 国際 空港
# Narita Airport
-æç°ç©ºæ¸¯ æç° ç©ºæ¸¯
+æç°ç©ºæ¸¯ æç° æç°ç©ºæ¸¯/0 空港
# Haneda Airport
-ç¾½ç°ç©ºæ¸¯ ç¾½ç° ç©ºæ¸¯
+ç¾½ç°ç©ºæ¸¯ ç¾½ç° ç¾½ç°ç©ºæ¸¯/0 空港
# Nara Institute of Science and Technology
-å¥è¯å
端ç§å¦æè¡å¤§å¦é¢å¤§å¦ å¥è¯ å
端 ç§å¦ æè¡ å¤§å¦é¢ 大å¦
+å¥è¯å
端ç§å¦æè¡å¤§å¦é¢å¤§å¦ å¥è¯ å¥è¯å
端ç§å¦æè¡å¤§å¦é¢å¤§å¦/0 å
端 ç§å¦ æè¡ å¤§å¦é¢ 大å¦
# Tokyo University
-æ±äº¬å¤§å¦ æ±äº¬ 大å¦
+æ±äº¬å¤§å¦ æ±äº¬ æ±äº¬å¤§å¦/0 大å¦
# Kyoto University
-京é½å¤§å¦ äº¬é½ å¤§å¦
+京é½å¤§å¦ äº¬é½ äº¬é½å¤§å¦/0 大å¦
+
+# NOTE: differs from non-compound mode:
# Kyoto University Baseball Club
-京é½å¤§å¦ç¡¬å¼éçé¨ äº¬é½ å¤§å¦ ç¡¬å¼ éç é¨
+京é½å¤§å¦ç¡¬å¼éçé¨ äº¬é½å¤§ å¦ ç¡¬å¼ éç é¨
##
## Katakana titles
##
# Senior Software Engineer
-ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ã㢠ã·ã㢠ã½ããã¦ã§ã¢ ã¨ã³ã¸ãã¢
+ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ã㢠ã·ã㢠ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢/0 ã½ããã¦ã§ã¢ ã¨ã³ã¸ãã¢
# Software Engineer
ã½ããã¦ã§ã¢ã¨ã³ã¸ã㢠ã½ããã¦ã§ã¢ ã¨ã³ã¸ãã¢
# Senior Project Manager
-ã·ãã¢ããã¸ã§ã¯ãããã¸ã£ã¼ ã·ã㢠ããã¸ã§ã¯ã ããã¸ã£ã¼
+ã·ãã¢ããã¸ã§ã¯ãããã¸ã£ã¼ ã·ã㢠ã·ãã¢ããã¸ã§ã¯ãããã¸ã£ã¼/0 ããã¸ã§ã¯ã ããã¸ã£ã¼
# Project Manager
ããã¸ã§ã¯ãããã¸ã£ã¼ ããã¸ã§ã¯ã ããã¸ã£ã¼
# Senior Sales Engineer
-ã·ãã¢ã»ã¼ã«ã¹ã¨ã³ã¸ã㢠ã·ã㢠ã»ã¼ã«ã¹ ã¨ã³ã¸ãã¢
+ã·ãã¢ã»ã¼ã«ã¹ã¨ã³ã¸ã㢠ã·ã㢠ã·ãã¢ã»ã¼ã«ã¹ã¨ã³ã¸ãã¢/0 ã»ã¼ã«ã¹ ã¨ã³ã¸ãã¢
# System Architect
-ã·ã¹ãã ã¢ã¼ããã¯ã ã·ã¹ãã ã¢ã¼ããã¯ã
+ã·ã¹ãã ã¢ã¼ããã¯ã ã·ã¹ãã ã·ã¹ãã ã¢ã¼ããã¯ã/0 ã¢ã¼ããã¯ã
# Senior System Architect
-ã·ãã¢ã·ã¹ãã ã¢ã¼ããã¯ã ã·ã㢠ã·ã¹ãã ã¢ã¼ããã¯ã
+ã·ãã¢ã·ã¹ãã ã¢ã¼ããã¯ã ã·ã㢠ã·ãã¢ã·ã¹ãã ã¢ã¼ããã¯ã/0 ã·ã¹ãã ã¢ã¼ããã¯ã
# System Administrator
ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿
-ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼ ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼
+ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼ ã·ã¹ãã ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼/0 ã¢ãããã¹ãã¬ã¼ã¿ã¼
# Senior System Administrator
-ã·ãã¢ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼ ã·ã㢠ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼
+ã·ãã¢ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼ ã·ã㢠ã·ãã¢ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼/0 ã·ã¹ãã ã¢ãããã¹ãã¬ã¼ã¿ã¼
##
## Company names (several are fictitious)
@@ -70,25 +72,25 @@
# SoftBank Mobile
ã½ãããã³ã¯ã¢ãã¤ã« ã½ãããã³ã¯ ã¢ãã¤ã«
# Alpine Materials
-ã¢ã«ãã¤ã³ãããªã¢ã«ãº ã¢ã«ãã¤ã³ ãããªã¢ã«ãº
+ã¢ã«ãã¤ã³ãããªã¢ã«ãº ã¢ã«ãã¤ã³ ã¢ã«ãã¤ã³ãããªã¢ã«ãº/0 ãããªã¢ã«ãº
# Sapporo Holdings
ãµããããã¼ã«ãã£ã³ã°ã¹ ãµããã ãã¼ã«ãã£ã³ã°ã¹
# Yamada Corporation
-ã¤ããã³ã¼ãã¬ã¼ã·ã§ã³ ã¤ãã ã³ã¼ãã¬ã¼ã·ã§ã³
+ã¤ããã³ã¼ãã¬ã¼ã·ã§ã³ ã¤ãã ã¤ããã³ã¼ãã¬ã¼ã·ã§ã³/0 ã³ã¼ãã¬ã¼ã·ã§ã³
# Canon Semiconductor equipment NOTE: Semiconductor becomes semi + conductor
-ãã¤ãã³ã»ãã³ã³ãã¯ã¿ã¼ã¨ã¯ã£ããã¡ã³ã ãã¤ãã³ ã»ã ã³ã³ãã¯ã¿ã¼ ã¨ã¯ã£ããã¡ã³ã
+ãã¤ãã³ã»ãã³ã³ãã¯ã¿ã¼ã¨ã¯ã£ããã¡ã³ã ãã¤ãã³ ãã¤ãã³ã»ãã³ã³ãã¯ã¿ã¼ã¨ã¯ã£ããã¡ã³ã/0 ã»ã ã³ã³ãã¯ã¿ã¼ ã¨ã¯ã£ããã¡ã³ã
# Oriental Chain
-ãªãªã¨ã³ã¿ã«ãã¨ã³ ãªãªã¨ã³ã¿ã« ãã¨ã³
+ãªãªã¨ã³ã¿ã«ãã¨ã³ ãªãªã¨ã³ã¿ã« ãªãªã¨ã³ã¿ã«ãã¨ã³/0 ãã¨ã³
# Ally Projects Japan NOTE: Becomes one token as ããã¸ã§ã¯ã is not in IPADIC
ã¢ã¼ãªã¼ããã¸ã§ã¯ãã¸ã£ãã³ ã¢ã¼ãªã¼ããã¸ã§ã¯ãã¸ã£ãã³
# Peter Pan Corporation
-ãã¼ã¿ã¼ãã³ã³ã¼ãã¬ã¼ã·ã§ã³ ãã¼ã¿ã¼ ãã³ ã³ã¼ãã¬ã¼ã·ã§ã³
+ãã¼ã¿ã¼ãã³ã³ã¼ãã¬ã¼ã·ã§ã³ ãã¼ã¿ã¼ ãã¼ã¿ã¼ãã³ã³ã¼ãã¬ã¼ã·ã§ã³/0 ãã³ ã³ã¼ãã¬ã¼ã·ã§ã³
# AIM Create
ã¨ã¤ã ã¯ãªã¨ã¤ã ã¨ã¤ã ã¯ãªã¨ã¤ã
# Mars Engineering
-ãã¼ã¹ã¨ã³ã¸ãã¢ãªã³ã° ãã¼ã¹ ã¨ã³ã¸ãã¢ãªã³ã°
+ãã¼ã¹ã¨ã³ã¸ãã¢ãªã³ã° ãã¼ã¹ ãã¼ã¹ã¨ã³ã¸ãã¢ãªã³ã°/0 ã¨ã³ã¸ãã¢ãªã³ã°
# Fuji Protein Technology
-ãã¸ãããã¤ã³ãã¯ããã¸ã¼ ã㸠ãããã¤ã³ ãã¯ããã¸ã¼
+ãã¸ãããã¤ã³ãã¯ããã¸ã¼ ã㸠ãã¸ãããã¤ã³ãã¯ããã¸ã¼/0 ãããã¤ã³ ãã¯ããã¸ã¼
##
## Person names
@@ -100,7 +102,7 @@
ã¹ãã£ã¼ãã¸ã§ã㺠ã¹ãã£ã¼ã ã¸ã§ããº
# Harry Potter NOTE: Becomes one token (short word)
ããªã¼ããã¿ã¼ ããªã¼ããã¿ã¼
-# Bill Gates NOTE: Becomes one token (short work)
+# Bill Gates NOTE: Becomes one token (short word)
ãã«ã²ã¤ã ãã«ã²ã¤ã
# Sean Connery NOTE: Becomes one token (okay)
ã·ã§ã¼ã³ã³ããªã¼ ã·ã§ã¼ã³ã³ããªã¼
@@ -133,8 +135,8 @@
##
# JT Engineering NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
-ã¸ã§ã¤ãã£ã¨ã³ã¸ãã¢ãªã³ã° ã¸ã§ã¤ ãã£ã¨ã³ ã¸ã㢠ãªã³ã°
+ã¸ã§ã¤ãã£ã¨ã³ã¸ãã¢ãªã³ã° ã¸ã§ã¤ ã¸ã§ã¤ãã£ã¨ã³ã¸ãã¢ãªã³ã°/0 ãã£ã¨ã³ ã¸ã㢠ãªã³ã°
# Anchovy pasta NOTE: Become Anch yvipasta
-ã¢ã³ãã§ããã¹ã¿ ã¢ã³ã ã§ããã¹ã¿
+ã¢ã³ãã§ããã¹ã¿ ã¢ã³ã ã¢ã³ãã§ããã¹ã¿/0 ã§ããã¹ã¿
# Surprise gift NOTE: Becomes one token (surprise not in IPADIC)
ãµãã©ã¤ãºã®ãã ãµãã©ã¤ãºã®ãã
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt?rev=1296805&r1=1296804&r2=1296805&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt Sun Mar 4 13:34:13 2012
@@ -4,3 +4,7 @@
# Custom reading for sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名
+
+# Silly entry:
+abcd,a b cd,foo1 foo2 foo3,bar
+abcdefg,ab cd efg,foo1 foo2 foo4,bar
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1296805&r1=1296804&r2=1296805&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Sun Mar 4 13:34:13 2012
@@ -28,8 +28,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory ex
@Override
public Tokenizer create(Reader input) {
- return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+ return new KuromojiTokenizer(input, userDictionary, true, mode);
}
private Mode getMode(Map<String, String> args) {
@@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory ex
if (mode != null) {
return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
} else {
- return Segmenter.DEFAULT_MODE;
+ return KuromojiTokenizer.DEFAULT_MODE;
}
}
}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java?rev=1296805&r1=1296804&r2=1296805&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java Sun Mar 4 13:34:13 2012
@@ -50,7 +50,7 @@ public class TestKuromojiTokenizerFactor
factory.inform(new SolrResourceLoader(null, null));
TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
- new String[] { "シニア", "ソフトウェア", "エンジニア" }
+ new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
);
}