You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 19:04:54 UTC
svn commit: r1229660 [2/2] - in /lucene/dev/branches/lucene3305: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ modules/analysis/kuromoji/s...

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt Tue Jan 10 18:04:53 2012
@@ -0,0 +1,410 @@
+# set of default stop tags:
+# uncomment a part of speech to treat those words as stopwords.
+# the entire tagset is provided here for convenience.
+#
+#####
+#  noun: unclassified nouns
+#åè©
+#
+#  noun-common: Common nouns or nouns where the sub-classification is undefined
+#åè©-ä¸è¬
+#
+#  noun-proper: Proper nouns where the sub-classification is undefined 
+#åè©-åºæåè©
+#
+#  noun-proper-misc: miscellaneous proper nouns
+#åè©-åºæåè©-ä¸è¬
+#
+#  noun-proper-person: Personal names where the sub-classification is undefined
+#åè©-åºæåè©-äººå
+#
+#  noun-proper-person-misc: names that cannot be divided into surname and 
+#  given name; foreign names; names where the surname or given name is unknown.
+#  e.g. ãå¸ã®æ¹
+#åè©-åºæåè©-äººå-ä¸è¬
+#
+#  noun-proper-person-surname: Mainly Japanese surnames.
+#  e.g. å±±ç°
+#åè©-åºæåè©-äººå-å§
+#
+#  noun-proper-person-given_name: Mainly Japanese given names.
+#  e.g. å¤ªé
+#åè©-åºæåè©-äººå-å
+#
+#  noun-proper-organization: Names representing organizations.
+#  e.g. éç£ç, NHK
+#åè©-åºæåè©-çµç¹
+#
+#  noun-proper-place: Place names where the sub-classification is undefined
+#åè©-åºæåè©-å°å
+#
+#  noun-proper-place-misc: Place names excluding countries.
+#  e.g. ã¢ã¸ã¢, ãã«ã»ãã, äº¬é½
+#åè©-åºæåè©-å°å-ä¸è¬
+#
+#  noun-proper-place-country: Country names. 
+#  e.g. æ¥æ¬, ãªã¼ã¹ãã©ãªã¢
+#åè©-åºæåè©-å°å-å½
+#
+#  noun-pronoun: Pronouns where the sub-classification is undefined
+#åè©-ä»£åè©
+#
+#  noun-pronoun-misc: miscellaneous pronouns: 
+#  e.g. ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã
+#åè©-ä»£åè©-ä¸è¬
+#
+#  noun-pronoun-contraction: Spoken language contraction made by combining a 
+#  pronoun and the particle 'wa'.
+#  e.g. ããã, ããã, ãããã, ããã, ãããã 
+#åè©-ä»£åè©-ç¸®ç´
+#
+#  noun-adverbial: Temporal nouns such as names of days or months that behave 
+#  like adverbs. Nouns that represent amount or ratios and can be used adverbially,
+#  e.g. éæ, ä¸æ, åå¾, å°é
+#åè©-å¯è©å¯è½
+#
+#  noun-verbal: Nouns that take arguments with case and can appear followed by 
+#  'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã)
+#  e.g. ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã
+#åè©-ãµå¤æ¥ç¶
+#
+#  noun-adjective-base: The base form of adjectives, words that appear before な ("na")
+#  e.g. å¥åº·, å®æ, é§ç®, ã ã
+#åè©-å½¢å®¹åè©èªå¹¹
+#
+#  noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ (å), æ°.
+#  e.g. 0, 1, 2, ä½, æ°, å¹¾
+#åè©-æ°
+#
+#  noun-affix: noun affixes where the sub-classification is undefined
+#åè©-éèªç«
+#
+#  noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words that 
+#  attach to the base form of inflectional words, words that cannot be classified 
+#  into any of the other categories below. This category includes indefinite nouns.
+#  e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, ç, ãã¨, äº, ãã¨, æ¯, ãã ã, æ¬¡ç¬¬, 
+#       é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿, 
+#       æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è, ãã, æ, ããã, æä»¥, ãã, è¨³,
+#       ãã, å²ã, å², ã-å£èª/, ãã-å£èª/
+#åè©-éèªç«-ä¸è¬
+#
+#  noun-affix-adverbial: noun affixes that can behave as adverbs.
+#  e.g. ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, ä»¥å¤, ä»¥é, ä»¥å¾, ä»¥ä¸, ä»¥å, ä¸æ¹, ãã, 
+#       ä¸, ãã¡, å, ãã, æã, ããã, éã, ãã, ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã, 
+#       æä¸, ããã, èªä½, ãã³, åº¦, ãã, çº, ã¤ã©, é½åº¦, ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ, 
+#       ã¨ãã, éç«¯, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾, 
+#       å, ä¾, ã¿ãã, ç¢å
+#åè©-éèªç«-å¯è©å¯è½
+#
+#  noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars 
+#  with the stem よう(だ) ("you(da)").
+#  e.g.  ãã, ãã, æ§ (ãã)
+#åè©-éèªç«-å©åè©èªå¹¹
+#  
+#  noun-affix-adjective-base: noun affixes that can connect to the indeclinable
+#  connection form な (aux "da").
+#  e.g. ã¿ãã, ãµã
+#åè©-éèªç«-å½¢å®¹åè©èªå¹¹
+#
+#  noun-special: special nouns where the sub-classification is undefined.
+#åè©-ç¹æ®
+#
+#  noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is 
+#  treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the base 
+#  form of inflectional words.
+#  e.g. ãã
+#åè©-ç¹æ®-å©åè©èªå¹¹
+#
+#  noun-suffix: noun suffixes where the sub-classification is undefined.
+#åè©-æ¥å°¾
+#
+#  noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect 
+#  to ガル or タイ and can combine into compound nouns, words that cannot be classified into
+#  any of the other categories below. In general, this category is more inclusive than 
+#  接尾語 ("suffix") and is usually the last element in a compound noun.
+#  e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, ããã¿, (ï½ãã) ã, æ¬¡ç¬¬, æ¸ (ã) ã¿,
+#       ãã, (ã§ã)ã£ã, æ, è¦³, æ§, å¦, é¡, é¢, ç¨
+#åè©-æ¥å°¾-ä¸è¬
+#
+#  noun-suffix-person: Suffixes that form nouns and attach to person names more often
+#  than other nouns.
+#  e.g. å, æ§, è
+#åè©-æ¥å°¾-äººå
+#
+#  noun-suffix-place: Suffixes that form nouns and attach to place names more often 
+#  than other nouns.
+#  e.g. çº, å¸, ç
+#åè©-æ¥å°¾-å°å
+#
+#  noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that 
+#  can appear before スル ("suru").
+#  e.g. å, è¦, åã, å¥ã, è½ã¡, è²·ã
+#åè©-æ¥å°¾-ãµå¤æ¥ç¶
+#
+#  noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, 
+#  is treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the 
+#  conjunctive form of inflectional words.
+#  e.g. ãã
+#åè©-æ¥å°¾-å©åè©èªå¹¹
+#
+#  noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive 
+#  form of inflectional words and appear before the copula だ ("da").
+#  e.g. ç, ã, ãã¡
+#åè©-æ¥å°¾-å½¢å®¹åè©èªå¹¹
+#
+#  noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
+#  e.g. å¾ (ã), ä»¥å¾, ä»¥é, ä»¥å, åå¾, ä¸, æ«, ä¸, æ (ã)
+#åè©-æ¥å°¾-å¯è©å¯è½
+#
+#  noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category 
+#  is more inclusive than 助数詞 ("classifier") and includes common nouns that attach 
+#  to numbers.
+#  e.g. å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», æé, æå
+#åè©-æ¥å°¾-å©æ°è©
+#
+#  noun-suffix-special: Special suffixes that mainly attach to inflecting words.
+#  e.g. (æ¥½ã) ã, (èã) æ¹
+#åè©-æ¥å°¾-ç¹æ®
+#
+#  noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words 
+#  together.
+#  e.g. (æ¥æ¬) å¯¾ (ã¢ã¡ãªã«), å¯¾ (ã¢ã¡ãªã«), (3) å¯¾ (5), (å¥³åª) å¼ (ä¸»å©¦)
+#åè©-æ¥ç¶è©ç
+#
+#  noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are 
+#  semantically verb-like.
+#  e.g. ããã, ãè¦§, å¾¡è¦§, é æ´
+#åè©-åè©éèªç«ç
+#
+#  noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, 
+#  dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") 
+#  is いわく ("iwaku").
+#åè©-å¼ç¨æåå
+#
+#  noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
+#  behave like an adjective.
+#  e.g. ç³ãè¨³, ä»æ¹, ã¨ãã§ã, éã
+#åè©-ãã¤å½¢å®¹è©èªå¹¹
+#
+#####
+#  prefix: unclassified prefixes
+æ¥é è©
+#
+#  prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) 
+#  excluding numerical expressions.
+#  e.g. ã (æ°´), æ (æ°), å (ç¤¾), æ (ï½æ°), é« (åè³ª), ã (è¦äº), ã (ç«æ´¾)
+æ¥é è©-åè©æ¥ç¶
+#
+#  prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
+#  in conjunctive form followed by ãªã/ãªãã/ãã ãã.
+#  e.g. ã (èªã¿ãªãã), ã (åº§ã)
+æ¥é è©-åè©æ¥ç¶
+#
+#  prefix-adjectival: Prefixes that attach to adjectives.
+#  e.g. ã (å¯ãã§ããã), ãã« (ã§ãã)
+æ¥é è©-å½¢å®¹è©æ¥ç¶
+#
+#  prefix-numerical: Prefixes that attach to numerical expressions.
+#  e.g. ç´, ããã, æ¯æ
+æ¥é è©-æ°æ¥ç¶
+#
+#####
+#  verb: unclassified verbs
+#åè©
+#
+#  verb-main:
+#åè©-èªç«
+#
+#  verb-auxiliary:
+åè©-éèªç«
+#
+#  verb-suffix:
+#åè©-æ¥å°¾
+#
+#####
+#  adjective: unclassified adjectives
+#å½¢å®¹è©
+#
+#  adjective-main:
+#å½¢å®¹è©-èªç«
+#
+#  adjective-auxiliary:
+#å½¢å®¹è©-éèªç«
+#
+#  adjective-suffix:
+#å½¢å®¹è©-æ¥å°¾
+#
+#####
+#  adverb: unclassified adverbs
+#å¯è©
+#
+#  adverb-misc: Words that can be segmented into one unit and where adnominal 
+#  modification is not possible.
+#  e.g. ãããããã, å¤å
+#å¯è©-ä¸è¬
+#
+#  adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«, 
+#  ãª, ãã, ã , etc.
+#  e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã
+#å¯è©-å©è©é¡æ¥ç¶
+#
+#####
+#  adnominal: Words that only have noun-modifying forms.
+#  e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, ä½ããã®, ããããª, ãããã, ãããã, ãããã, 
+#       ã©ããã, ãããª, ãããª, ãããª, ã©ããª, å¤§ããª, å°ããª, ããããª, ã»ãã®, ãããã, 
+#       ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ããã, å ããã, åãªã, ãããªã, æãããåã, äº¡ã
+#é£ä½è©
+#
+#####
+#  conjunction: Conjunctions that can occur independently.
+#  e.g. ã, ããã©ã, ããã¦, ããã, ããã©ããã
+æ¥ç¶è©
+#
+#####
+#  particle: unclassified particles.
+å©è©
+#
+#  particle-case: case particles where the subclassification is undefined.
+å©è©-æ ¼å©è©
+#
+#  particle-case-misc: Case particles.
+#  e.g. ãã, ã, ã§, ã¨, ã«, ã¸, ãã, ã, ã®, ã«ã¦
+å©è©-æ ¼å©è©-ä¸è¬
+#
+#  particle-case-quote: the "to" that appears after nouns, a person's speech, 
+#  quotation marks, expressions of decisions from a meeting, reasons, judgements,
+#  conjectures, etc.
+#  e.g. ( ã ) ã¨ (è¿°ã¹ã.), ( ã§ãã) ã¨ (ãã¦å·è¡ç¶äº...)
+å©è©-æ ¼å©è©-å¼ç¨
+#
+#  particle-case-compound: Compounds of particles and verbs that mainly behave 
+#  like case particles.
+#  e.g. ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦,
+#       ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã, 
+#       ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã, 
+#       ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦, 
+#       ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦,
+#       ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã, 
+#       ã«ããã£ã¦, ã«ããã, ããã£ã¦, ãä»¥ã£ã¦, ãéã, ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã,
+#       ã£ã¦-å£èª/, ã¡ãã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã (äºº)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ
+å©è©-æ ¼å©è©-é£èª
+#
+#  particle-conjunctive:
+#  e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã, 
+#       ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), ãããªã, (ããã) ãã(ãããªã)-å£èª/, 
+#       (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ (ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/
+å©è©-æ¥ç¶å©è©
+#
+#  particle-dependency:
+#  e.g. ãã, ãã, ãã, ãã, ã¯, ã, ã
+å©è©-ä¿å©è©
+#
+#  particle-adverbial:
+#  e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (å¦æ ¡) ãã(ãããæµè¡ã£ã¦ãã)-å£èª/, 
+#       (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, ãªã©, (ç§) ãªã (ã«), (åç) ãªãã (å¤§å«ã)-å£èª/,
+#       (ç§) ãªãã, (åç) ãªãã¦ (å¤§å«ã)-å£èª/, ã®ã¿, ã ã, (ç§) ã ã£ã¦-å£èª/, ã ã«, 
+#       (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), (ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/,
+#       ã»ã©, ç¨, ã¾ã§, è¿, (èª°) ã (ã)([å©è©-æ ¼å©è©] ããã³ [å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã)
+å©è©-å¯å©è©
+#
+#  particle-interjective: particles with interjective grammatical roles.
+#  e.g. (æ¾å³¶) ã
+å©è©-éæå©è©
+#
+#  particle-coordinate:
+#  e.g. ã¨, ãã, ã ã®, ã ã, ã¨ã, ãªã, ã, ãã
+å©è©-ä¸¦ç«å©è©
+#
+#  particle-final:
+#  e.g. ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã, 
+#       ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/
+å©è©-çµå©è©
+#
+#  particle-adverbial/coordinate/final: The particle "ka" when unknown whether it is 
+#  adverbial, coordinate, or sentence final. For example:
+#       (a) ãA ã B ãã. Ex:ã(å½åã§éç¨ãã) ã,(æµ·å¤ã§éç¨ãã) ã (.)ã
+#       (b) Inside an adverb phrase. Ex:ã(å¹¸ãã¨ãã) ã (, æ»èã¯ããªãã£ã.)ã
+#           ã(ç¥ããå±ãããã) ã (, è©¦é¨ã«åæ ¼ãã.)ã
+#       (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã (ã®ããã«æ¯ãèã£ã.)ã
+#  e.g. ã
+å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è©
+#
+#  particle-adnominalizer: The "no" that attaches to nouns and modifies 
+#  non-inflectional words.
+å©è©-é£ä½å
+#
+#  particle-adverbializer: The "ni" and "to" that appear following nouns and adverbs 
+#  that are giongo, giseigo, or gitaigo.
+#  e.g. ã«, ã¨
+å©è©-å¯è©å
+#
+#  particle-special: A particle that does not fit into one of the above classifications. 
+#  This includes particles that are used in Tanka, Haiku, and other poetry.
+#  e.g. ããª, ãã, ( ããã ãã) ã«, (ããã) ã«ã(ãããã), (ä¿º) ã (å®¶)
+å©è©-ç¹æ®
+#
+#####
+#  auxiliary-verb:
+å©åè©
+#
+#####
+#  interjection: Greetings and other exclamations.
+#  e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, ãããã¨ããããã¾ã, 
+#       ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, ã¯ã, ããã, ããã, ããããªãã
+æåè©
+#
+#####
+#  symbol: unclassified Symbols.
+#è¨å·
+#
+#  symbol-misc: A general symbol not in one of the categories below.
+#  e.g. [ââ@$ãâ+]
+è¨å·-ä¸è¬
+#
+#  symbol-comma: Commas
+#  e.g. [,ã]
+è¨å·-èªç¹
+#
+#  symbol-period: Periods and full stops.
+#  e.g. [.ï¼ã]
+è¨å·-å¥ç¹
+#
+#  symbol-space: Full-width whitespace.
+è¨å·-ç©ºç½
+#
+#  symbol-open_bracket:
+#  e.g. [({ââãã]
+è¨å·-æ¬å¼§é
+#
+#  symbol-close_bracket:
+#  e.g. [)}ââããã]
+è¨å·-æ¬å¼§é
+#
+#  symbol-alphabetic:
+#è¨å·-ã¢ã«ãã¡ããã
+#
+#####
+#  other: unclassified other
+#ãã®ä»
+#
+#  other-interjection: Words that are hard to classify as noun-suffixes or 
+#  sentence-final particles.
+#  e.g. (ã )ã¡
+ãã®ä»-éæ
+#
+#####
+#  filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
+#  e.g. ãã®, ããã¨, ãã¨
+ãã£ã©ã¼
+#
+#####
+#  non-verbal: non-verbal sound.
+éè¨èªé³
+#
+#####
+#  fragment:
+#èªæç
+#
+#####
+#  unknown: unknown part of speech.
+#æªç¥èª

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt Tue Jan 10 18:04:53 2012
@@ -0,0 +1,13 @@
+# short set of japanese stopwords
+ãã
+ãã
+äººç©
+ãã¾
+ãããã¨
+ãã
+ãã®
+ããã¦
+ãªã
+ã§ãã
+ãã
+ãã

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Tue Jan 10 18:04:53 2012
@@ -165,15 +165,15 @@ public class SegmenterTest extends Lucen
   public void testPartOfSpeech() {
     List<Token> tokens = segmenter.tokenize("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã");
     assertEquals(9, tokens.size());
-    assertEquals("åè©,ä»£åè©,ä¸è¬,*",  tokens.get(0).getPartOfSpeech());
-    assertEquals("å©è©,ä¿å©è©,*,*",    tokens.get(1).getPartOfSpeech());
-    assertEquals("å¯è©,å©è©é¡æ¥ç¶,*,*", tokens.get(2).getPartOfSpeech());
-    assertEquals("åè©,ãµå¤æ¥ç¶,*,*",   tokens.get(3).getPartOfSpeech());
-    assertEquals("åè©,ä¸è¬,*,*",      tokens.get(4).getPartOfSpeech());
-    assertEquals("å©è©,æ ¼å©è©,ä¸è¬,*",  tokens.get(5).getPartOfSpeech());
-    assertEquals("åè©,èªç«,*,*",      tokens.get(6).getPartOfSpeech());
-    assertEquals("å©åè©,*,*,*",       tokens.get(7).getPartOfSpeech());
-    assertEquals("è¨å·,å¥ç¹,*,*",      tokens.get(8).getPartOfSpeech());
+    assertEquals("åè©-ä»£åè©-ä¸è¬",  tokens.get(0).getPartOfSpeech());
+    assertEquals("å©è©-ä¿å©è©",    tokens.get(1).getPartOfSpeech());
+    assertEquals("å¯è©-å©è©é¡æ¥ç¶", tokens.get(2).getPartOfSpeech());
+    assertEquals("åè©-ãµå¤æ¥ç¶",   tokens.get(3).getPartOfSpeech());
+    assertEquals("åè©-ä¸è¬",      tokens.get(4).getPartOfSpeech());
+    assertEquals("å©è©-æ ¼å©è©-ä¸è¬",  tokens.get(5).getPartOfSpeech());
+    assertEquals("åè©-èªç«",      tokens.get(6).getPartOfSpeech());
+    assertEquals("å©åè©",       tokens.get(7).getPartOfSpeech());
+    assertEquals("è¨å·-å¥ç¹",      tokens.get(8).getPartOfSpeech());
   }
   
   public void testBocchan() throws Exception {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java Tue Jan 10 18:04:53 2012
@@ -14,8 +14,7 @@ import org.apache.lucene.util.Version;
 public class SimpleBench {
   
   public static void main(String args[]) throws Exception {
-    Segmenter segmenter = new Segmenter();
-    Analyzer a = new KuromojiAnalyzer(segmenter);
+    Analyzer a = new KuromojiAnalyzer(Version.LUCENE_CURRENT);
     Analyzer b = new CJKAnalyzer(Version.LUCENE_CURRENT);
     
     /* slight warmup */

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java Tue Jan 10 18:04:53 2012
@@ -25,24 +25,17 @@ import org.apache.lucene.analysis.BaseTo
 import org.apache.lucene.analysis.Tokenizer;
 
 public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
-  private Analyzer analyzer;
-
-  public void setUp() throws Exception {
-    super.setUp();
-    final Segmenter segmenter = new Segmenter();
-    analyzer = new Analyzer() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
-        return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
-      }
-    };
-  }
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
+    }
+  };
   
   public void testBasics() throws IOException {
-    assertAnalyzesTo(analyzer, "ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã",
-        new String[] { "ãã", "ã¯", "ã¾ã ", "å®é¨", "æ®µé", "ã«", "ãã", "ã¾ã", "ã" }
+    assertAnalyzesTo(analyzer, "ããã¯ã¾ã å®é¨æ®µéã«ããã¾ã",
+        new String[] { "ãã", "ã¯", "ã¾ã ", "å®é¨", "æ®µé", "ã«", "ãã", "ã¾ã"  }
     );
   }
   

Copied: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (from r1229589, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?p2=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java&p1=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java&r1=1229589&r2=1229660&rev=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Tue Jan 10 18:04:53 2012
@@ -17,35 +17,37 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
 
-public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
-  private Analyzer analyzer;
-
-  public void setUp() throws Exception {
-    super.setUp();
-    final Segmenter segmenter = new Segmenter();
-    analyzer = new KuromojiAnalyzer(segmenter);
-  }
+public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
   
   public void testDecomposition1() throws Exception {
     assertAnalyzesTo(analyzer, "æ¬æ¥ã¯ãè²§å°å±¤ã®å¥³æ§ãåä¾ã«å»çä¿è·ãæä¾ããããã«åµè¨ãããå¶åº¦ã§ããã" +
                          "ã¢ã¡ãªã«ä½æå¾èå»çæ´å©å¶åº¦ããä»æ¥ã§ã¯ããã®äºç®ã®ç´ï¼åã®ï¼ãèäººã«è²»ããã¦ããã",
-     new String[] { "æ¬æ¥", "ã¯", "ã", "è²§å°", "å±¤", "ã®", "å¥³æ§", "ã", "åä¾", "ã«", "å»ç", "ä¿è·", "ã",      
-                    "æä¾", "ãã", "ãã", "ã«", "åµè¨", "ã", "ã", "ã", "å¶åº¦", "ã§", "ãã", "ã", "ã¢ã¡ãªã«", 
-                    "ä½", "æå¾", "è", "å»ç", "æ´å©", "å¶åº¦", "ã", "ã", "ä»æ¥", "ã§", "ã¯", "ã", "ãã®",
-                    "äºç®", "ã®", "ç´", "ï¼", "åã®", "ï¼", "ã", "èäºº", "ã«", "è²»ãã", "ã¦", "ãã", "ã" },
-     new int[] { 0, 2, 3, 4, 6, 7,  8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 
-                 31, 33, 34, 36, 37, 41, 42, 44, 45, 47, 49, 51, 52, 53, 55, 56, 57, 58, 60,
-                 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 },
-     new int[] { 2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
-                 33, 34, 36, 37, 41, 42, 44, 45, 47, 49, 51, 52, 53, 55, 56, 57, 58, 60, 62,
-                 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78, 79 }
+     new String[] { "æ¬æ¥", "ã¯",  "è²§å°", "å±¤", "ã®", "å¥³æ§", "ã", "åä¾", "ã«", "å»ç", "ä¿è·", "ã",      
+                    "æä¾", "ãã", "ãã", "ã«", "åµè¨", "ã", "ã", "ã", "å¶åº¦", "ã§", "ãã",  "ã¢ã¡ãªã«", 
+                    "ä½", "æå¾", "è", "å»ç", "æ´å©", "å¶åº¦", "ã",  "ä»æ¥", "ã§", "ã¯",  "ãã®",
+                    "äºç®", "ã®", "ç´", "ï¼", "åã®", "ï¼", "ã", "èäºº", "ã«", "è²»ãã", "ã¦", "ãã" },
+     new int[] { 0, 2, 4, 6, 7,  8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 
+                 31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 60,
+                 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 },
+     new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
+                 33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 62,
+                 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 }
     );
   }
   
@@ -59,9 +61,9 @@ public class TestKuromojiAnalyzer extend
   
   public void testDecomposition3() throws Exception {
     assertAnalyzesTo(analyzer, "éå¥³ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã",
-      new String[] { "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", "ã»", "ãããã³ã¹", "ã" },
-      new int[] { 0, 2, 3, 5,  9, 10, 15 },
-      new int[] { 2, 3, 5, 9, 10, 15, 16 }
+      new String[] { "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼",  "ãããã³ã¹" },
+      new int[] { 0, 2, 3, 5, 10 },
+      new int[] { 2, 3, 5, 9, 15 }
     );
   }
 
@@ -84,9 +86,9 @@ public class TestKuromojiAnalyzer extend
   /** Tests that sentence offset is incorporated into the resulting offsets */
   public void testTwoSentences() throws Exception {
     assertAnalyzesTo(analyzer, "éå¥³ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã éå¥³ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã",
-      new String[] { "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", "ã»", "ãããã³ã¹", "ã", " ", "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", "ã»", "ãããã³ã¹", "ã" },
-      new int[] { 0, 2, 3, 5,  9, 10, 15, 16, 17, 19, 20, 22, 26, 27, 32 },
-      new int[] { 2, 3, 5, 9, 10, 15, 16, 17, 19, 20, 22, 26, 27, 32, 33 }
+      new String[] { "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", "ãããã³ã¹",  "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼",  "ãããã³ã¹"  },
+      new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
+      new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
     );
   }
 
@@ -116,9 +118,9 @@ public class TestKuromojiAnalyzer extend
     );
     
     assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("ããã¯æ¬ã§ã¯ãªã    ")),
-        new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã", " ", " ", " ", " " },
-        new int[] { 0, 2, 3, 4, 5, 6, 8, 9, 10, 11 },
-        new int[] { 2, 3, 4, 5, 6, 8, 9, 10, 11, 12 },
+        new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã"  },
+        new int[] { 0, 2, 3, 4, 5, 6, 8 },
+        new int[] { 2, 3, 4, 5, 6, 8, 9 },
         new Integer(12)
     );
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Tue Jan 10 18:04:53 2012
@@ -21,7 +21,6 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -29,9 +28,6 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.zip.ZipFile;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -56,13 +52,12 @@ public class TestQuality extends LuceneT
      word agreement?: 0.999587584716181
      */
     final Segmenter segmenter = new Segmenter();
-    Analyzer testAnalyzer = new KuromojiAnalyzer(segmenter);
     
     String line1 = null;
     String line2 = null;
     while ((line1 = unseg.readLine()) != null) {
       line2 = seg.readLine();
-      evaluateLine(line1, line2, testAnalyzer, stats);
+      evaluateLine(line1, line2, segmenter, stats);
     }
     
     System.out.println("#words: " + stats.numWords);
@@ -84,15 +79,12 @@ public class TestQuality extends LuceneT
     long numSentencesCorrect = 0;
   }
   
-  public static void evaluateLine(String unseg, String seg, Analyzer analyzer, Stats stats) throws Exception {
+  public static void evaluateLine(String unseg, String seg, Segmenter segmenter, Stats stats) throws Exception {
     List<String> tokens = new ArrayList<String>();
-    TokenStream stream = analyzer.tokenStream("bogus", new StringReader(unseg));
-    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
-    stream.reset();
-    while (stream.incrementToken()) {
-      tokens.add(termAtt.toString());
+    List<Token> output = segmenter.tokenize(unseg);
+    for (Token t : output) {
+      tokens.add(t.getSurfaceFormString());
     }
-    stream.close();
     
     List<String> expectedTokens = Arrays.asList(seg.split("\\s+"));
     tokens = normalize(tokens);

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Tue Jan 10 18:04:53 2012
@@ -69,9 +69,13 @@ public abstract class BinaryDictionaryWr
     
     // build up the POS string
     for (int i = 4; i < 8; i++) {
-      sb.append(CSVUtil.quoteEscape(entry[i]));
-      if (i < 7) {
-        sb.append(',');
+      String part = entry[i];
+      assert part.length() > 0;
+      if (!"*".equals(part)) {
+        if (sb.length() > 0) {
+          sb.append('-');
+        }
+        sb.append(part);
       }
     }
     String pos = sb.toString();

Modified: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Tue Jan 10 18:04:53 2012
@@ -42,7 +42,11 @@ public class KuromojiTokenizerFactory ex
     Mode mode = args.get(MODE) != null ? Mode.valueOf(args.get(MODE).toUpperCase(Locale.ENGLISH)) : Mode.NORMAL;
     String userDictionaryPath = args.get(USER_DICT_PATH);
     try {
-      this.segmenter = new Segmenter(new UserDictionary(userDictionaryPath), mode);
+      if (userDictionaryPath != null) {
+        this.segmenter = new Segmenter(new UserDictionary(userDictionaryPath), mode);
+      } else {
+        this.segmenter = new Segmenter(mode);
+      }
     } catch (Exception e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
     }