You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 19:04:54 UTC

svn commit: r1229660 [2/2] - in /lucene/dev/branches/lucene3305: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ modules/analysis/kuromoji/s...

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt Tue Jan 10 18:04:53 2012
@@ -0,0 +1,410 @@
+# set of default stop tags:
+# uncomment a part of speech to treat those words as stopwords.
+# the entire tagset is provided here for convenience.
+#
+#####
+#  noun: unclassified nouns
+#名詞
+#
+#  noun-common: Common nouns or nouns where the sub-classification is undefined
+#名詞-一般
+#
+#  noun-proper: Proper nouns where the sub-classification is undefined 
+#名詞-固有名詞
+#
+#  noun-proper-misc: miscellaneous proper nouns
+#名詞-固有名詞-一般
+#
+#  noun-proper-person: Personal names where the sub-classification is undefined
+#名詞-固有名詞-人名
+#
+#  noun-proper-person-misc: names that cannot be divided into surname and 
+#  given name; foreign names; names where the surname or given name is unknown.
+#  e.g. お市の方
+#名詞-固有名詞-人名-一般
+#
+#  noun-proper-person-surname: Mainly Japanese surnames.
+#  e.g. 山田
+#名詞-固有名詞-人名-姓
+#
+#  noun-proper-person-given_name: Mainly Japanese given names.
+#  e.g. 太郎
+#名詞-固有名詞-人名-名
+#
+#  noun-proper-organization: Names representing organizations.
+#  e.g. 通産省, NHK
+#名詞-固有名詞-組織
+#
+#  noun-proper-place: Place names where the sub-classification is undefined
+#名詞-固有名詞-地域
+#
+#  noun-proper-place-misc: Place names excluding countries.
+#  e.g. アジア, バルセロナ, 京都
+#名詞-固有名詞-地域-一般
+#
+#  noun-proper-place-country: Country names. 
+#  e.g. 日本, オーストラリア
+#名詞-固有名詞-地域-国
+#
+#  noun-pronoun: Pronouns where the sub-classification is undefined
+#名詞-代名詞
+#
+#  noun-pronoun-misc: miscellaneous pronouns: 
+#  e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
+#名詞-代名詞-一般
+#
+#  noun-pronoun-contraction: Spoken language contraction made by combining a 
+#  pronoun and the particle 'wa'.
+#  e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ 
+#名詞-代名詞-縮約
+#
+#  noun-adverbial: Temporal nouns such as names of days or months that behave 
+#  like adverbs. Nouns that represent amount or ratios and can be used adverbially,
+#  e.g. 金曜, 一月, 午後, 少量
+#名詞-副詞可能
+#
+#  noun-verbal: Nouns that take arguments with case and can appear followed by 
+#  'suru' and related verbs (する, できる, なさる, くださる)
+#  e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
+#名詞-サ変接続
+#
+#  noun-adjective-base: The base form of adjectives, words that appear before な ("na")
+#  e.g. 健康, 安易, 駄目, だめ
+#名詞-形容動詞語幹
+#
+#  noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
+#  e.g. 0, 1, 2, 何, 数, 幾
+#名詞-数
+#
+#  noun-affix: noun affixes where the sub-classification is undefined
+#名詞-非自立
+#
+#  noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that 
+#  attach to the base form of inflectional words, words that cannot be classified 
+#  into any of the other categories below. This category includes indefinite nouns.
+#  e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, 
+#       順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, 
+#       拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
+#       わり, 割り, 割, ん-口語/, もん-口語/
+#名詞-非自立-一般
+#
+#  noun-affix-adverbial: noun affixes that that can behave as adverbs.
+#  e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, 
+#       上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, 
+#       最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, 
+#       とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, 
+#       儘, 侭, みぎり, 矢先
+#名詞-非自立-副詞可能
+#
+#  noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars 
+#  with the stem よう(だ) ("you(da)").
+#  e.g.  よう, やう, 様 (よう)
+#名詞-非自立-助動詞語幹
+#  
+#  noun-affix-adjective-base: noun affixes that can connect to the indeclinable
+#  connection form な (aux "da").
+#  e.g. みたい, ふう
+#名詞-非自立-形容動詞語幹
+#
+#  noun-special: special nouns where the sub-classification is undefined.
+#名詞-特殊
+#
+#  noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is 
+#  treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base 
+#  form of inflectional words.
+#  e.g. そう
+#名詞-特殊-助動詞語幹
+#
+#  noun-suffix: noun suffixes where the sub-classification is undefined.
+#名詞-接尾
+#
+#  noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect 
+#  to ガル or タイ and can combine into compound nouns, words that cannot be classified into
+#  any of the other categories below. In general, this category is more inclusive than 
+#  接尾語 ("suffix") and is usually the last element in a compound noun.
+#  e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
+#       よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
+#名詞-接尾-一般
+#
+#  noun-suffix-person: Suffixes that form nouns and attach to person names more often
+#  than other nouns.
+#  e.g. 君, 様, 著
+#名詞-接尾-人名
+#
+#  noun-suffix-place: Suffixes that form nouns and attach to place names more often 
+#  than other nouns.
+#  e.g. 町, 市, 県
+#名詞-接尾-地域
+#
+#  noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that 
+#  can appear before スル ("suru").
+#  e.g. 化, 視, 分け, 入り, 落ち, 買い
+#名詞-接尾-サ変接続
+#
+#  noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, 
+#  is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the 
+#  conjunctive form of inflectional words.
+#  e.g. そう
+#名詞-接尾-助動詞語幹
+#
+#  noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive 
+#  form of inflectional words and appear before the copula だ ("da").
+#  e.g. 的, げ, がち
+#名詞-接尾-形容動詞語幹
+#
+#  noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
+#  e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
+#名詞-接尾-副詞可能
+#
+#  noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category 
+#  is more inclusive than 助数詞 ("classifier") and includes common nouns that attach 
+#  to numbers.
+#  e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
+#名詞-接尾-助数詞
+#
+#  noun-suffix-special: Special suffixes that mainly attach to inflecting words.
+#  e.g. (楽し) さ, (考え) 方
+#名詞-接尾-特殊
+#
+#  noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words 
+#  together.
+#  e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
+#名詞-接続詞的
+#
+#  noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are 
+#  semantically verb-like.
+#  e.g. ごらん, ご覧, 御覧, 頂戴
+#名詞-動詞非自立的
+#
+#  noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, 
+#  dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") 
+#  is いわく ("iwaku").
+#名詞-引用文字列
+#
+#  noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
+#  behave like an adjective.
+#  e.g. 申し訳, 仕方, とんでも, 違い
+#名詞-ナイ形容詞語幹
+#
+#####
+#  prefix: unclassified prefixes
+接頭詞
+#
+#  prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) 
+#  excluding numerical expressions.
+#  e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
+接頭詞-名詞接続
+#
+#  prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
+#  in conjunctive form followed by なる/なさる/くださる.
+#  e.g. お (読みなさい), お (座り)
+接頭詞-動詞接続
+#
+#  prefix-adjectival: Prefixes that attach to adjectives.
+#  e.g. お (寒いですねえ), バカ (でかい)
+接頭詞-形容詞接続
+#
+#  prefix-numerical: Prefixes that attach to numerical expressions.
+#  e.g. 約, およそ, 毎時
+接頭詞-数接続
+#
+#####
+#  verb: unclassified verbs
+#動詞
+#
+#  verb-main:
+#動詞-自立
+#
+#  verb-auxiliary:
+動詞-非自立
+#
+#  verb-suffix:
+#動詞-接尾
+#
+#####
+#  adjective: unclassified adjectives
+#形容詞
+#
+#  adjective-main:
+#形容詞-自立
+#
+#  adjective-auxiliary:
+#形容詞-非自立
+#
+#  adjective-suffix:
+#形容詞-接尾
+#
+#####
+#  adverb: unclassified adverbs
+#副詞
+#
+#  adverb-misc: Words that can be segmented into one unit and where adnominal 
+#  modification is not possible.
+#  e.g. あいかわらず, 多分
+#副詞-一般
+#
+#  adverb-particle_conjunction: Adverbs that can be followed by の, は, に, 
+#  な, する, だ, etc.
+#  e.g. こんなに, そんなに, あんなに, なにか, なんでも
+#副詞-助詞類接続
+#
+#####
+#  adnominal: Words that only have noun-modifying forms.
+#  e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, 
+#       どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, 
+#       「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
+#連体詞
+#
+#####
+#  conjunction: Conjunctions that can occur independently.
+#  e.g. が, けれども, そして, じゃあ, それどころか
+接続詞
+#
+#####
+#  particle: unclassified particles.
+助詞
+#
+#  particle-case: case particles where the subclassification is undefined.
+助詞-格助詞
+#
+#  particle-case-misc: Case particles.
+#  e.g. から, が, で, と, に, へ, より, を, の, にて
+助詞-格助詞-一般
+#
+#  particle-case-quote: the "to" that appears after nouns, a person’s speech, 
+#  quotation marks, expressions of decisions from a meeting, reasons, judgements,
+#  conjectures, etc.
+#  e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
+助詞-格助詞-引用
+#
+#  particle-case-compound: Compounds of particles and verbs that mainly behave 
+#  like case particles.
+#  e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
+#       にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, 
+#       にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, 
+#       に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, 
+#       に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
+#       にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, 
+#       にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
+#       って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
+助詞-格助詞-連語
+#
+#  particle-conjunctive:
+#  e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, 
+#       ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, 
+#       (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
+助詞-接続助詞
+#
+#  particle-dependency:
+#  e.g. こそ, さえ, しか, すら, は, も, ぞ
+助詞-係助詞
+#
+#  particle-adverbial:
+#  e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, 
+#       (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
+#       (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, 
+#       (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
+#       ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
+助詞-副助詞
+#
+#  particle-interjective: particles with interjective grammatical roles.
+#  e.g. (松島) や
+助詞-間投助詞
+#
+#  particle-coordinate:
+#  e.g. と, たり, だの, だり, とか, なり, や, やら
+助詞-並立助詞
+#
+#  particle-final:
+#  e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, 
+#       ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
+助詞-終助詞
+#
+#  particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is 
+#  adverbial, conjunctive, or sentence final. For example:
+#       (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
+#       (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
+#           「(祈りが届いたせい) か (, 試験に合格した.)」
+#       (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
+#  e.g. か
+助詞-副助詞/並立助詞/終助詞
+#
+#  particle-adnominalizer: The "no" that attaches to nouns and modifies 
+#  non-inflectional words.
+助詞-連体化
+#
+#  particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs 
+#  that are giongo, giseigo, or gitaigo.
+#  e.g. に, と
+助詞-副詞化
+#
+#  particle-special: A particle that does not fit into one of the above classifications. 
+#  This includes particles that are used in Tanka, Haiku, and other poetry.
+#  e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
+助詞-特殊
+#
+#####
+#  auxiliary-verb:
+助動詞
+#
+#####
+#  interjection: Greetings and other exclamations.
+#  e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, 
+#       いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
+感動詞
+#
+#####
+#  symbol: unclassified Symbols.
+#記号
+#
+#  symbol-misc: A general symbol not in one of the categories below.
+#  e.g. [○◎@$〒→+]
+記号-一般
+#
+#  symbol-comma: Commas
+#  e.g. [,、]
+記号-読点
+#
+#  symbol-period: Periods and full stops.
+#  e.g. [..。]
+記号-句点
+#
+#  symbol-space: Full-width whitespace.
+記号-空白
+#
+#  symbol-open_bracket:
+#  e.g. [({‘“『【]
+記号-括弧開
+#
+#  symbol-close_bracket:
+#  e.g. [)}’”』」】]
+記号-括弧閉
+#
+#  symbol-alphabetic:
+#記号-アルファベット
+#
+#####
+#  other: unclassified other
+#その他
+#
+#  other-interjection: Words that are hard to classify as noun-suffixes or 
+#  sentence-final particles.
+#  e.g. (だ)ァ
+その他-間投
+#
+#####
+#  filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
+#  e.g. あの, うんと, えと
+フィラー
+#
+#####
+#  non-verbal: non-verbal sound.
+非言語音
+#
+#####
+#  fragment:
+#語断片
+#
+#####
+#  unknown: unknown part of speech.
+#未知語

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt Tue Jan 10 18:04:53 2012
@@ -0,0 +1,13 @@
+# short set of japanese stopwords
+いう
+する
+人物
+さま
+すること
+ため
+もの
+おいて
+なる
+できる
+おく
+ある

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Tue Jan 10 18:04:53 2012
@@ -165,15 +165,15 @@ public class SegmenterTest extends Lucen
   public void testPartOfSpeech() {
     List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
     assertEquals(9, tokens.size());
-    assertEquals("名詞,代名詞,一般,*",  tokens.get(0).getPartOfSpeech());
-    assertEquals("助詞,係助詞,*,*",    tokens.get(1).getPartOfSpeech());
-    assertEquals("副詞,助詞類接続,*,*", tokens.get(2).getPartOfSpeech());
-    assertEquals("名詞,サ変接続,*,*",   tokens.get(3).getPartOfSpeech());
-    assertEquals("名詞,一般,*,*",      tokens.get(4).getPartOfSpeech());
-    assertEquals("助詞,格助詞,一般,*",  tokens.get(5).getPartOfSpeech());
-    assertEquals("動詞,自立,*,*",      tokens.get(6).getPartOfSpeech());
-    assertEquals("助動詞,*,*,*",       tokens.get(7).getPartOfSpeech());
-    assertEquals("記号,句点,*,*",      tokens.get(8).getPartOfSpeech());
+    assertEquals("名詞-代名詞-一般",  tokens.get(0).getPartOfSpeech());
+    assertEquals("助詞-係助詞",    tokens.get(1).getPartOfSpeech());
+    assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
+    assertEquals("名詞-サ変接続",   tokens.get(3).getPartOfSpeech());
+    assertEquals("名詞-一般",      tokens.get(4).getPartOfSpeech());
+    assertEquals("助詞-格助詞-一般",  tokens.get(5).getPartOfSpeech());
+    assertEquals("動詞-自立",      tokens.get(6).getPartOfSpeech());
+    assertEquals("助動詞",       tokens.get(7).getPartOfSpeech());
+    assertEquals("記号-句点",      tokens.get(8).getPartOfSpeech());
   }
   
   public void testBocchan() throws Exception {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java Tue Jan 10 18:04:53 2012
@@ -14,8 +14,7 @@ import org.apache.lucene.util.Version;
 public class SimpleBench {
   
   public static void main(String args[]) throws Exception {
-    Segmenter segmenter = new Segmenter();
-    Analyzer a = new KuromojiAnalyzer(segmenter);
+    Analyzer a = new KuromojiAnalyzer(Version.LUCENE_CURRENT);
     Analyzer b = new CJKAnalyzer(Version.LUCENE_CURRENT);
     
     /* slight warmup */

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java Tue Jan 10 18:04:53 2012
@@ -25,24 +25,17 @@ import org.apache.lucene.analysis.BaseTo
 import org.apache.lucene.analysis.Tokenizer;
 
 public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
-  private Analyzer analyzer;
-
-  public void setUp() throws Exception {
-    super.setUp();
-    final Segmenter segmenter = new Segmenter();
-    analyzer = new Analyzer() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
-        return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
-      }
-    };
-  }
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
+    }
+  };
   
   public void testBasics() throws IOException {
-    assertAnalyzesTo(analyzer, "それはまだ実験段階にあります。",
-        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます", "。" }
+    assertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
+        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます"  }
     );
   }
   

Copied: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (from r1229589, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?p2=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java&p1=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java&r1=1229589&r2=1229660&rev=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Tue Jan 10 18:04:53 2012
@@ -17,35 +17,37 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
 
-public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
-  private Analyzer analyzer;
-
-  public void setUp() throws Exception {
-    super.setUp();
-    final Segmenter segmenter = new Segmenter();
-    analyzer = new KuromojiAnalyzer(segmenter);
-  }
+public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
   
   public void testDecomposition1() throws Exception {
     assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
                          "アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
-     new String[] { "本来", "は", "、", "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",      
-                    "提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある", "、", "アメリカ", 
-                    "低", "所得", "者", "医療", "援助", "制度", "が", "、", "今日", "で", "は", "、", "その",
-                    "予算", "の", "約", "3", "分の", "1", "を", "老人", "に", "費やし", "て", "いる", "。" },
-     new int[] { 0, 2, 3, 4, 6, 7,  8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 
-                 31, 33, 34, 36, 37, 41, 42, 44, 45, 47, 49, 51, 52, 53, 55, 56, 57, 58, 60,
-                 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 },
-     new int[] { 2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
-                 33, 34, 36, 37, 41, 42, 44, 45, 47, 49, 51, 52, 53, 55, 56, 57, 58, 60, 62,
-                 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78, 79 }
+     new String[] { "本来", "は",  "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",      
+                    "提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある",  "アメリカ", 
+                    "低", "所得", "者", "医療", "援助", "制度", "が",  "今日", "で", "は",  "その",
+                    "予算", "の", "約", "3", "分の", "1", "を", "老人", "に", "費やし", "て", "いる" },
+     new int[] { 0, 2, 4, 6, 7,  8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 
+                 31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 60,
+                 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 },
+     new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
+                 33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 62,
+                 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 }
     );
   }
   
@@ -59,9 +61,9 @@ public class TestKuromojiAnalyzer extend
   
   public void testDecomposition3() throws Exception {
     assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
-      new String[] { "魔女", "狩", "大将", "マシュー", "・", "ホプキンス", "。" },
-      new int[] { 0, 2, 3, 5,  9, 10, 15 },
-      new int[] { 2, 3, 5, 9, 10, 15, 16 }
+      new String[] { "魔女", "狩", "大将", "マシュー",  "ホプキンス" },
+      new int[] { 0, 2, 3, 5, 10 },
+      new int[] { 2, 3, 5, 9, 15 }
     );
   }
 
@@ -84,9 +86,9 @@ public class TestKuromojiAnalyzer extend
   /** Tests that sentence offset is incorporated into the resulting offsets */
   public void testTwoSentences() throws Exception {
     assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
-      new String[] { "魔女", "狩", "大将", "マシュー", "・", "ホプキンス", "。", " ", "魔女", "狩", "大将", "マシュー", "・", "ホプキンス", "。" },
-      new int[] { 0, 2, 3, 5,  9, 10, 15, 16, 17, 19, 20, 22, 26, 27, 32 },
-      new int[] { 2, 3, 5, 9, 10, 15, 16, 17, 19, 20, 22, 26, 27, 32, 33 }
+      new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス",  "魔女", "狩", "大将", "マシュー",  "ホプキンス"  },
+      new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
+      new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
     );
   }
 
@@ -116,9 +118,9 @@ public class TestKuromojiAnalyzer extend
     );
     
     assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない    ")),
-        new String[] { "これ", "は", "本", "で", "は", "ない", " ", " ", " ", " " },
-        new int[] { 0, 2, 3, 4, 5, 6, 8, 9, 10, 11 },
-        new int[] { 2, 3, 4, 5, 6, 8, 9, 10, 11, 12 },
+        new String[] { "これ", "は", "本", "で", "は", "ない"  },
+        new int[] { 0, 2, 3, 4, 5, 6, 8 },
+        new int[] { 2, 3, 4, 5, 6, 8, 9 },
         new Integer(12)
     );
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Tue Jan 10 18:04:53 2012
@@ -21,7 +21,6 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -29,9 +28,6 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.zip.ZipFile;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -56,13 +52,12 @@ public class TestQuality extends LuceneT
      word agreement?: 0.999587584716181
      */
     final Segmenter segmenter = new Segmenter();
-    Analyzer testAnalyzer = new KuromojiAnalyzer(segmenter);
     
     String line1 = null;
     String line2 = null;
     while ((line1 = unseg.readLine()) != null) {
       line2 = seg.readLine();
-      evaluateLine(line1, line2, testAnalyzer, stats);
+      evaluateLine(line1, line2, segmenter, stats);
     }
     
     System.out.println("#words: " + stats.numWords);
@@ -84,15 +79,12 @@ public class TestQuality extends LuceneT
     long numSentencesCorrect = 0;
   }
   
-  public static void evaluateLine(String unseg, String seg, Analyzer analyzer, Stats stats) throws Exception {
+  public static void evaluateLine(String unseg, String seg, Segmenter segmenter, Stats stats) throws Exception {
     List<String> tokens = new ArrayList<String>();
-    TokenStream stream = analyzer.tokenStream("bogus", new StringReader(unseg));
-    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
-    stream.reset();
-    while (stream.incrementToken()) {
-      tokens.add(termAtt.toString());
+    List<Token> output = segmenter.tokenize(unseg);
+    for (Token t : output) {
+      tokens.add(t.getSurfaceFormString());
     }
-    stream.close();
     
     List<String> expectedTokens = Arrays.asList(seg.split("\\s+"));
     tokens = normalize(tokens);

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Tue Jan 10 18:04:53 2012
@@ -69,9 +69,13 @@ public abstract class BinaryDictionaryWr
     
     // build up the POS string
     for (int i = 4; i < 8; i++) {
-      sb.append(CSVUtil.quoteEscape(entry[i]));
-      if (i < 7) {
-        sb.append(',');
+      String part = entry[i];
+      assert part.length() > 0;
+      if (!"*".equals(part)) {
+        if (sb.length() > 0) {
+          sb.append('-');
+        }
+        sb.append(part);
       }
     }
     String pos = sb.toString();

Modified: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Tue Jan 10 18:04:53 2012
@@ -42,7 +42,11 @@ public class KuromojiTokenizerFactory ex
     Mode mode = args.get(MODE) != null ? Mode.valueOf(args.get(MODE).toUpperCase(Locale.ENGLISH)) : Mode.NORMAL;
     String userDictionaryPath = args.get(USER_DICT_PATH);
     try {
-      this.segmenter = new Segmenter(new UserDictionary(userDictionaryPath), mode);
+      if (userDictionaryPath != null) {
+        this.segmenter = new Segmenter(new UserDictionary(userDictionaryPath), mode);
+      } else {
+        this.segmenter = new Segmenter(mode);
+      }
     } catch (Exception e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
     }