You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 19:04:54 UTC
svn commit: r1229660 [2/2] - in /lucene/dev/branches/lucene3305:
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/
modules/analysis/kuromoji/s...
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt Tue Jan 10 18:04:53 2012
@@ -0,0 +1,410 @@
+# set of default stop tags:
+# uncomment a part of speech to treat those words as stopwords.
+# the entire tagset is provided here for convenience.
+#
+#####
+# noun: unclassified nouns
+#名詞
+#
+# noun-common: Common nouns or nouns where the sub-classification is undefined
+#åè©-ä¸è¬
+#
+# noun-proper: Proper nouns where the sub-classification is undefined
+#åè©-åºæåè©
+#
+# noun-proper-misc: miscellaneous proper nouns
+#åè©-åºæåè©-ä¸è¬
+#
+# noun-proper-person: Personal names where the sub-classification is undefined
+#åè©-åºæåè©-人å
+#
+# noun-proper-person-misc: names that cannot be divided into surname and
+# given name; foreign names; names where the surname or given name is unknown.
+# e.g. ãå¸ã®æ¹
+#åè©-åºæåè©-人å-ä¸è¬
+#
+# noun-proper-person-surname: Mainly Japanese surnames.
+# e.g. å±±ç°
+#åè©-åºæåè©-人å-å§
+#
+# noun-proper-person-given_name: Mainly Japanese given names.
+# e.g. 太é
+#åè©-åºæåè©-人å-å
+#
+# noun-proper-organization: Names representing organizations.
+# e.g. éç£ç, NHK
+#åè©-åºæåè©-çµç¹
+#
+# noun-proper-place: Place names where the sub-classification is undefined
+#åè©-åºæåè©-å°å
+#
+# noun-proper-place-misc: Place names excluding countries.
+# e.g. ã¢ã¸ã¢, ãã«ã»ãã, 京é½
+#åè©-åºæåè©-å°å-ä¸è¬
+#
+# noun-proper-place-country: Country names.
+# e.g. æ¥æ¬, ãªã¼ã¹ãã©ãªã¢
+#åè©-åºæåè©-å°å-å½
+#
+# noun-pronoun: Pronouns where the sub-classification is undefined
+#åè©-代åè©
+#
+# noun-pronoun-misc: miscellaneous pronouns:
+# e.g. ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã
+#åè©-代åè©-ä¸è¬
+#
+# noun-pronoun-contraction: Spoken language contraction made by combining a
+# pronoun and the particle 'wa'.
+# e.g. ããã, ããã, ãããã, ããã, ãããã
+#åè©-代åè©-縮ç´
+#
+# noun-adverbial: Temporal nouns such as names of days or months that behave
+# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
+# e.g. éæ, ä¸æ, åå¾, å°é
+#åè©-å¯è©å¯è½
+#
+# noun-verbal: Nouns that take arguments with case and can appear followed by
+# 'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã)
+# e.g. ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã
+#åè©-ãµå¤æ¥ç¶
+#
+# noun-adjective-base: The base form of adjectives, words that appear before 㪠("na")
+# e.g. å¥åº·, å®æ, é§ç®, ã ã
+#åè©-形容åè©èªå¹¹
+#
+# noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ (å), æ°.
+# e.g. 0, 1, 2, ä½, æ°, å¹¾
+#åè©-æ°
+#
+# noun-affix: noun affixes where the sub-classification is undefined
+#åè©-éèªç«
+#
+# noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words that
+# attach to the base form of inflectional words, words that cannot be classified
+# into any of the other categories below. This category includes indefinite nouns.
+# e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, ç, ãã¨, äº, ãã¨, æ¯, ãã ã, 次第,
+# é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿,
+# æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è
, ãã, æ
, ããã, æ以, ãã, 訳,
+# ãã, å²ã, å², ã-å£èª/, ãã-å£èª/
+#åè©-éèªç«-ä¸è¬
+#
+# noun-affix-adverbial: noun affixes that can behave as adverbs.
+# e.g. ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, 以å¤, 以é, 以å¾, 以ä¸, 以å, ä¸æ¹, ãã,
+# ä¸, ãã¡, å
, ãã, æã, ããã, éã, ãã, ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã,
+# æä¸, ããã, èªä½, ãã³, 度, ãã, çº, ã¤ã©, é½åº¦, ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ,
+# ã¨ãã, é端, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾,
+# å, ä¾, ã¿ãã, ç¢å
+#åè©-éèªç«-å¯è©å¯è½
+#
+# noun-affix-aux: noun affixes treated as å©åè© ("auxiliary verb") in school grammars
+# with the stem ãã(ã ) ("you(da)").
+# e.g. ãã, ãã, æ§ (ãã)
+#åè©-éèªç«-å©åè©èªå¹¹
+#
+# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
+# connection form 㪠(aux "da").
+# e.g. ã¿ãã, ãµã
+#åè©-éèªç«-形容åè©èªå¹¹
+#
+# noun-special: special nouns where the sub-classification is undefined.
+#åè©-ç¹æ®
+#
+# noun-special-aux: The ããã ("souda") stem form that is used for reporting news, is
+# treated as å©åè© ("auxiliary verb") in school grammars, and attach to the base
+# form of inflectional words.
+# e.g. ãã
+#åè©-ç¹æ®-å©åè©èªå¹¹
+#
+# noun-suffix: noun suffixes where the sub-classification is undefined.
+#åè©-æ¥å°¾
+#
+# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
+# to ã¬ã« or ã¿ã¤ and can combine into compound nouns, words that cannot be classified into
+# any of the other categories below. In general, this category is more inclusive than
+# æ¥å°¾èª ("suffix") and is usually the last element in a compound noun.
+# e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, ããã¿, (ï½ãã) ã, 次第, æ¸ (ã) ã¿,
+# ãã, (ã§ã)ã£ã, æ, 観, æ§, å¦, é¡, é¢, ç¨
+#åè©-æ¥å°¾-ä¸è¬
+#
+# noun-suffix-person: Suffixes that form nouns and attach to person names more often
+# than other nouns.
+# e.g. å, æ§, è
+#åè©-æ¥å°¾-人å
+#
+# noun-suffix-place: Suffixes that form nouns and attach to place names more often
+# than other nouns.
+# e.g. çº, å¸, ç
+#åè©-æ¥å°¾-å°å
+#
+# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
+# can appear before ã¹ã« ("suru").
+# e.g. å, è¦, åã, å
¥ã, è½ã¡, è²·ã
+#åè©-æ¥å°¾-ãµå¤æ¥ç¶
+#
+# noun-suffix-aux: The stem form of ããã (æ§æ
) that is used to indicate conditions,
+# is treated as å©åè© ("auxiliary verb") in school grammars, and attach to the
+# conjunctive form of inflectional words.
+# e.g. ãã
+#åè©-æ¥å°¾-å©åè©èªå¹¹
+#
+# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
+# form of inflectional words and appear before the copula ã ("da").
+# e.g. ç, ã, ãã¡
+#åè©-æ¥å°¾-形容åè©èªå¹¹
+#
+# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
+# e.g. å¾ (ã), 以å¾, 以é, 以å, åå¾, ä¸, æ«, ä¸, æ (ã)
+#åè©-æ¥å°¾-å¯è©å¯è½
+#
+# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
+# is more inclusive than å©æ°è© ("classifier") and includes common nouns that attach
+# to numbers.
+# e.g. å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», æé, æå
+#åè©-æ¥å°¾-å©æ°è©
+#
+# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
+# e.g. (楽ã) ã, (èã) æ¹
+#åè©-æ¥å°¾-ç¹æ®
+#
+# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
+# together.
+# e.g. (æ¥æ¬) 対 (ã¢ã¡ãªã«), 対 (ã¢ã¡ãªã«), (3) 対 (5), (女åª) å
¼ (主婦)
+#åè©-æ¥ç¶è©ç
+#
+# noun-verbal_aux: Nouns that attach to the conjunctive particle 㦠("te") and are
+# semantically verb-like.
+# e.g. ããã, ã覧, 御覧, é æ´
+#åè©-åè©éèªç«ç
+#
+# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
+# dialects, English, etc. Currently, the only entry for åè© å¼ç¨æåå ("noun quotation")
+# is ããã ("iwaku").
+#åè©-å¼ç¨æåå
+#
+# noun-nai_adjective: Words that appear before the auxiliary verb ãªã ("nai") and
+# behave like an adjective.
+# e.g. ç³ã訳, ä»æ¹, ã¨ãã§ã, éã
+#åè©-ãã¤å½¢å®¹è©èªå¹¹
+#
+#####
+# prefix: unclassified prefixes
+接頭詞
+#
+# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
+# excluding numerical expressions.
+# e.g. ã (æ°´), æ (æ°), å (社), æ
(ï½æ°), é« (å質), ã (è¦äº), ã (ç«æ´¾)
+æ¥é è©-åè©æ¥ç¶
+#
+# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
+# in conjunctive form followed by ãªã/ãªãã/ãã ãã.
+# e.g. ã (èªã¿ãªãã), ã (座ã)
+æ¥é è©-åè©æ¥ç¶
+#
+# prefix-adjectival: Prefixes that attach to adjectives.
+# e.g. ã (å¯ãã§ããã), ãã« (ã§ãã)
+æ¥é è©-形容è©æ¥ç¶
+#
+# prefix-numerical: Prefixes that attach to numerical expressions.
+# e.g. ç´, ããã, æ¯æ
+æ¥é è©-æ°æ¥ç¶
+#
+#####
+# verb: unclassified verbs
+#動詞
+#
+# verb-main:
+#åè©-èªç«
+#
+# verb-auxiliary:
+åè©-éèªç«
+#
+# verb-suffix:
+#åè©-æ¥å°¾
+#
+#####
+# adjective: unclassified adjectives
+#形容詞
+#
+# adjective-main:
+#形容è©-èªç«
+#
+# adjective-auxiliary:
+#形容è©-éèªç«
+#
+# adjective-suffix:
+#形容è©-æ¥å°¾
+#
+#####
+# adverb: unclassified adverbs
+#副詞
+#
+# adverb-misc: Words that can be segmented into one unit and where adnominal
+# modification is not possible.
+# e.g. ãããããã, å¤å
+#å¯è©-ä¸è¬
+#
+# adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«,
+# ãª, ãã, ã , etc.
+# e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã
+#å¯è©-å©è©é¡æ¥ç¶
+#
+#####
+# adnominal: Words that only have noun-modifying forms.
+# e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, ä½ããã®, ããããª, ãããã, ãããã, ãããã,
+# ã©ããã, ãããª, ãããª, ãããª, ã©ããª, 大ããª, å°ããª, ããããª, ã»ãã®, ãããã,
+# ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ã
ãã, å ã
ãã, åãªã, ãããªã, æãããåã, 亡ã
+#連体詞
+#
+#####
+# conjunction: Conjunctions that can occur independently.
+# e.g. ã, ããã©ã, ããã¦, ããã, ããã©ããã
+接続詞
+#
+#####
+# particle: unclassified particles.
+助詞
+#
+# particle-case: case particles where the subclassification is undefined.
+å©è©-æ ¼å©è©
+#
+# particle-case-misc: Case particles.
+# e.g. ãã, ã, ã§, ã¨, ã«, ã¸, ãã, ã, ã®, ã«ã¦
+å©è©-æ ¼å©è©-ä¸è¬
+#
+# particle-case-quote: the "to" that appears after nouns, a person's speech,
+# quotation marks, expressions of decisions from a meeting, reasons, judgements,
+# conjectures, etc.
+# e.g. ( ã ) 㨠(è¿°ã¹ã.), ( ã§ãã) 㨠(ãã¦å·è¡ç¶äº...)
+å©è©-æ ¼å©è©-å¼ç¨
+#
+# particle-case-compound: Compounds of particles and verbs that mainly behave
+# like case particles.
+# e.g. ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å
±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦,
+# ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã,
+# ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã,
+# ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦,
+# ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦,
+# ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã,
+# ã«ããã£ã¦, ã«ããã, ããã£ã¦, ã以ã£ã¦, ãéã, ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã,
+# ã£ã¦-å£èª/, ã¡ã
ã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã (人)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ
+å©è©-æ ¼å©è©-é£èª
+#
+# particle-conjunctive:
+# e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã,
+# ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), ãããªã, (ããã) ãã(ãããªã)-å£èª/,
+# (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ (ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/
+å©è©-æ¥ç¶å©è©
+#
+# particle-dependency:
+# e.g. ãã, ãã, ãã, ãã, ã¯, ã, ã
+å©è©-ä¿å©è©
+#
+# particle-adverbial:
+# e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (å¦æ ¡) ãã(ãããæµè¡ã£ã¦ãã)-å£èª/,
+# (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, ãªã©, (ç§) ãªã (ã«), (å
ç) ãªãã (大å«ã)-å£èª/,
+# (ç§) ãªãã, (å
ç) ãªã㦠(大å«ã)-å£èª/, ã®ã¿, ã ã, (ç§) ã ã£ã¦-å£èª/, ã ã«,
+# (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), (ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/,
+# ã»ã©, ç¨, ã¾ã§, è¿, (誰) ã (ã)([å©è©-æ ¼å©è©] ããã³ [å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã)
+å©è©-å¯å©è©
+#
+# particle-interjective: particles with interjective grammatical roles.
+# e.g. (æ¾å³¶) ã
+å©è©-éæå©è©
+#
+# particle-coordinate:
+# e.g. ã¨, ãã, ã ã®, ã ã, ã¨ã, ãªã, ã, ãã
+å©è©-並ç«å©è©
+#
+# particle-final:
+# e.g. ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã,
+# ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/
+å©è©-çµå©è©
+#
+# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is
+# adverbial, conjunctive, or sentence final. For example:
+# (a) ãA ã B ãã. Ex:ã(å½å
ã§éç¨ãã) ã,(æµ·å¤ã§éç¨ãã) ã (.)ã
+# (b) Inside an adverb phrase. Ex:ã(幸ãã¨ãã) ã (, æ»è
ã¯ããªãã£ã.)ã
+# ã(ç¥ããå±ãããã) ã (, 試é¨ã«åæ ¼ãã.)ã
+# (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã (ã®ããã«æ¯ãèã£ã.)ã
+# e.g. ã
+å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è©
+#
+# particle-adnominalizer: The "no" that attaches to nouns and modifies
+# non-inflectional words.
+å©è©-é£ä½å
+#
+# particle-adverbializer: The "ni" and "to" that appear following nouns and adverbs
+# that are giongo, giseigo, or gitaigo.
+# e.g. ã«, ã¨
+å©è©-å¯è©å
+#
+# particle-special: A particle that does not fit into one of the above classifications.
+# This includes particles that are used in Tanka, Haiku, and other poetry.
+# e.g. ããª, ãã, ( ããã ãã) ã«, (ããã) ã«ã(ãããã), (俺) ã (家)
+å©è©-ç¹æ®
+#
+#####
+# auxiliary-verb:
+助動詞
+#
+#####
+# interjection: Greetings and other exclamations.
+# e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, ãããã¨ããããã¾ã,
+# ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, ã¯ã, ããã, ããã, ããããªãã
+感動詞
+#
+#####
+# symbol: unclassified Symbols.
+#記号
+#
+# symbol-misc: A general symbol not in one of the categories below.
+# e.g. [ââ@$ãâ+]
+è¨å·-ä¸è¬
+#
+# symbol-comma: Commas
+# e.g. [,ã]
+è¨å·-èªç¹
+#
+# symbol-period: Periods and full stops.
+# e.g. [.ï¼ã]
+è¨å·-å¥ç¹
+#
+# symbol-space: Full-width whitespace.
+è¨å·-空ç½
+#
+# symbol-open_bracket:
+# e.g. [({ââãã]
+è¨å·-æ¬å¼§é
+#
+# symbol-close_bracket:
+# e.g. [)}ââããã]
+è¨å·-æ¬å¼§é
+#
+# symbol-alphabetic:
+#è¨å·-ã¢ã«ãã¡ããã
+#
+#####
+# other: unclassified other
+#その他
+#
+# other-interjection: Words that are hard to classify as noun-suffixes or
+# sentence-final particles.
+# e.g. (ã )ã¡
+ãã®ä»-éæ
+#
+#####
+# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
+# e.g. ãã®, ããã¨, ãã¨
+フィラー
+#
+#####
+# non-verbal: non-verbal sound.
+非言語音
+#
+#####
+# fragment:
+#語断片
+#
+#####
+# unknown: unknown part of speech.
+#未知語
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt Tue Jan 10 18:04:53 2012
@@ -0,0 +1,13 @@
+# short set of japanese stopwords
+ãã
+ãã
+人ç©
+ãã¾
+ãããã¨
+ãã
+ãã®
+ããã¦
+ãªã
+ã§ãã
+ãã
+ãã
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Tue Jan 10 18:04:53 2012
@@ -165,15 +165,15 @@ public class SegmenterTest extends Lucen
public void testPartOfSpeech() {
List<Token> tokens = segmenter.tokenize("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã");
assertEquals(9, tokens.size());
- assertEquals("åè©,代åè©,ä¸è¬,*", tokens.get(0).getPartOfSpeech());
- assertEquals("å©è©,ä¿å©è©,*,*", tokens.get(1).getPartOfSpeech());
- assertEquals("å¯è©,å©è©é¡æ¥ç¶,*,*", tokens.get(2).getPartOfSpeech());
- assertEquals("åè©,ãµå¤æ¥ç¶,*,*", tokens.get(3).getPartOfSpeech());
- assertEquals("åè©,ä¸è¬,*,*", tokens.get(4).getPartOfSpeech());
- assertEquals("å©è©,æ ¼å©è©,ä¸è¬,*", tokens.get(5).getPartOfSpeech());
- assertEquals("åè©,èªç«,*,*", tokens.get(6).getPartOfSpeech());
- assertEquals("å©åè©,*,*,*", tokens.get(7).getPartOfSpeech());
- assertEquals("è¨å·,å¥ç¹,*,*", tokens.get(8).getPartOfSpeech());
+ assertEquals("åè©-代åè©-ä¸è¬", tokens.get(0).getPartOfSpeech());
+ assertEquals("å©è©-ä¿å©è©", tokens.get(1).getPartOfSpeech());
+ assertEquals("å¯è©-å©è©é¡æ¥ç¶", tokens.get(2).getPartOfSpeech());
+ assertEquals("åè©-ãµå¤æ¥ç¶", tokens.get(3).getPartOfSpeech());
+ assertEquals("åè©-ä¸è¬", tokens.get(4).getPartOfSpeech());
+ assertEquals("å©è©-æ ¼å©è©-ä¸è¬", tokens.get(5).getPartOfSpeech());
+ assertEquals("åè©-èªç«", tokens.get(6).getPartOfSpeech());
+ assertEquals("å©åè©", tokens.get(7).getPartOfSpeech());
+ assertEquals("è¨å·-å¥ç¹", tokens.get(8).getPartOfSpeech());
}
public void testBocchan() throws Exception {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java Tue Jan 10 18:04:53 2012
@@ -14,8 +14,7 @@ import org.apache.lucene.util.Version;
public class SimpleBench {
public static void main(String args[]) throws Exception {
- Segmenter segmenter = new Segmenter();
- Analyzer a = new KuromojiAnalyzer(segmenter);
+ Analyzer a = new KuromojiAnalyzer(Version.LUCENE_CURRENT);
Analyzer b = new CJKAnalyzer(Version.LUCENE_CURRENT);
/* slight warmup */
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java Tue Jan 10 18:04:53 2012
@@ -25,24 +25,17 @@ import org.apache.lucene.analysis.BaseTo
import org.apache.lucene.analysis.Tokenizer;
public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
- private Analyzer analyzer;
-
- public void setUp() throws Exception {
- super.setUp();
- final Segmenter segmenter = new Segmenter();
- analyzer = new Analyzer() {
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
- return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
- }
- };
- }
+ private Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
+ }
+ };
public void testBasics() throws IOException {
- assertAnalyzesTo(analyzer, "ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã",
- new String[] { "ãã", "ã¯", "ã¾ã ", "å®é¨", "段é", "ã«", "ãã", "ã¾ã", "ã" }
+ assertAnalyzesTo(analyzer, "ããã¯ã¾ã å®é¨æ®µéã«ããã¾ã",
+ new String[] { "ãã", "ã¯", "ã¾ã ", "å®é¨", "段é", "ã«", "ãã", "ã¾ã" }
);
}
Copied: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (from r1229589, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?p2=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java&p1=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java&r1=1229589&r2=1229660&rev=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Tue Jan 10 18:04:53 2012
@@ -17,35 +17,37 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
+import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
-public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
- private Analyzer analyzer;
-
- public void setUp() throws Exception {
- super.setUp();
- final Segmenter segmenter = new Segmenter();
- analyzer = new KuromojiAnalyzer(segmenter);
- }
+public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
public void testDecomposition1() throws Exception {
assertAnalyzesTo(analyzer, "æ¬æ¥ã¯ã貧å°å±¤ã®å¥³æ§ãåä¾ã«å»çä¿è·ãæä¾ããããã«åµè¨ãããå¶åº¦ã§ããã" +
"ã¢ã¡ãªã«ä½æå¾è
å»çæ´å©å¶åº¦ããä»æ¥ã§ã¯ããã®äºç®ã®ç´ï¼åã®ï¼ãè人ã«è²»ããã¦ããã",
- new String[] { "æ¬æ¥", "ã¯", "ã", "貧å°", "層", "ã®", "女æ§", "ã", "åä¾", "ã«", "å»ç", "ä¿è·", "ã",
- "æä¾", "ãã", "ãã", "ã«", "åµè¨", "ã", "ã", "ã", "å¶åº¦", "ã§", "ãã", "ã", "ã¢ã¡ãªã«",
- "ä½", "æå¾", "è
", "å»ç", "æ´å©", "å¶åº¦", "ã", "ã", "ä»æ¥", "ã§", "ã¯", "ã", "ãã®",
- "äºç®", "ã®", "ç´", "ï¼", "åã®", "ï¼", "ã", "è人", "ã«", "è²»ãã", "ã¦", "ãã", "ã" },
- new int[] { 0, 2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30,
- 31, 33, 34, 36, 37, 41, 42, 44, 45, 47, 49, 51, 52, 53, 55, 56, 57, 58, 60,
- 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 },
- new int[] { 2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
- 33, 34, 36, 37, 41, 42, 44, 45, 47, 49, 51, 52, 53, 55, 56, 57, 58, 60, 62,
- 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78, 79 }
+ new String[] { "æ¬æ¥", "ã¯", "貧å°", "層", "ã®", "女æ§", "ã", "åä¾", "ã«", "å»ç", "ä¿è·", "ã",
+ "æä¾", "ãã", "ãã", "ã«", "åµè¨", "ã", "ã", "ã", "å¶åº¦", "ã§", "ãã", "ã¢ã¡ãªã«",
+ "ä½", "æå¾", "è
", "å»ç", "æ´å©", "å¶åº¦", "ã", "ä»æ¥", "ã§", "ã¯", "ãã®",
+ "äºç®", "ã®", "ç´", "ï¼", "åã®", "ï¼", "ã", "è人", "ã«", "è²»ãã", "ã¦", "ãã" },
+ new int[] { 0, 2, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30,
+ 31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 60,
+ 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 },
+ new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
+ 33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 62,
+ 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 }
);
}
@@ -59,9 +61,9 @@ public class TestKuromojiAnalyzer extend
public void testDecomposition3() throws Exception {
assertAnalyzesTo(analyzer, "é女ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã",
- new String[] { "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ã»", "ãããã³ã¹", "ã" },
- new int[] { 0, 2, 3, 5, 9, 10, 15 },
- new int[] { 2, 3, 5, 9, 10, 15, 16 }
+ new String[] { "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ãããã³ã¹" },
+ new int[] { 0, 2, 3, 5, 10 },
+ new int[] { 2, 3, 5, 9, 15 }
);
}
@@ -84,9 +86,9 @@ public class TestKuromojiAnalyzer extend
/** Tests that sentence offset is incorporated into the resulting offsets */
public void testTwoSentences() throws Exception {
assertAnalyzesTo(analyzer, "é女ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã é女ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã",
- new String[] { "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ã»", "ãããã³ã¹", "ã", " ", "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ã»", "ãããã³ã¹", "ã" },
- new int[] { 0, 2, 3, 5, 9, 10, 15, 16, 17, 19, 20, 22, 26, 27, 32 },
- new int[] { 2, 3, 5, 9, 10, 15, 16, 17, 19, 20, 22, 26, 27, 32, 33 }
+ new String[] { "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ãããã³ã¹", "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ãããã³ã¹" },
+ new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
+ new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
);
}
@@ -116,9 +118,9 @@ public class TestKuromojiAnalyzer extend
);
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("ããã¯æ¬ã§ã¯ãªã ")),
- new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã", " ", " ", " ", " " },
- new int[] { 0, 2, 3, 4, 5, 6, 8, 9, 10, 11 },
- new int[] { 2, 3, 4, 5, 6, 8, 9, 10, 11, 12 },
+ new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" },
+ new int[] { 0, 2, 3, 4, 5, 6, 8 },
+ new int[] { 2, 3, 4, 5, 6, 8, 9 },
new Integer(12)
);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Tue Jan 10 18:04:53 2012
@@ -21,7 +21,6 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -29,9 +28,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.zip.ZipFile;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
@@ -56,13 +52,12 @@ public class TestQuality extends LuceneT
word agreement?: 0.999587584716181
*/
final Segmenter segmenter = new Segmenter();
- Analyzer testAnalyzer = new KuromojiAnalyzer(segmenter);
String line1 = null;
String line2 = null;
while ((line1 = unseg.readLine()) != null) {
line2 = seg.readLine();
- evaluateLine(line1, line2, testAnalyzer, stats);
+ evaluateLine(line1, line2, segmenter, stats);
}
System.out.println("#words: " + stats.numWords);
@@ -84,15 +79,12 @@ public class TestQuality extends LuceneT
long numSentencesCorrect = 0;
}
- public static void evaluateLine(String unseg, String seg, Analyzer analyzer, Stats stats) throws Exception {
+ public static void evaluateLine(String unseg, String seg, Segmenter segmenter, Stats stats) throws Exception {
List<String> tokens = new ArrayList<String>();
- TokenStream stream = analyzer.tokenStream("bogus", new StringReader(unseg));
- CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
- stream.reset();
- while (stream.incrementToken()) {
- tokens.add(termAtt.toString());
+ List<Token> output = segmenter.tokenize(unseg);
+ for (Token t : output) {
+ tokens.add(t.getSurfaceFormString());
}
- stream.close();
List<String> expectedTokens = Arrays.asList(seg.split("\\s+"));
tokens = normalize(tokens);
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Tue Jan 10 18:04:53 2012
@@ -69,9 +69,13 @@ public abstract class BinaryDictionaryWr
// build up the POS string
for (int i = 4; i < 8; i++) {
- sb.append(CSVUtil.quoteEscape(entry[i]));
- if (i < 7) {
- sb.append(',');
+ String part = entry[i];
+ assert part.length() > 0;
+ if (!"*".equals(part)) {
+ if (sb.length() > 0) {
+ sb.append('-');
+ }
+ sb.append(part);
}
}
String pos = sb.toString();
Modified: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Tue Jan 10 18:04:53 2012
@@ -42,7 +42,11 @@ public class KuromojiTokenizerFactory ex
Mode mode = args.get(MODE) != null ? Mode.valueOf(args.get(MODE).toUpperCase(Locale.ENGLISH)) : Mode.NORMAL;
String userDictionaryPath = args.get(USER_DICT_PATH);
try {
- this.segmenter = new Segmenter(new UserDictionary(userDictionaryPath), mode);
+ if (userDictionaryPath != null) {
+ this.segmenter = new Segmenter(new UserDictionary(userDictionaryPath), mode);
+ } else {
+ this.segmenter = new Segmenter(mode);
+ }
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}