You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/05 14:05:42 UTC
svn commit: r1240714 - in
/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji:
stoptags.txt stopwords.txt
Author: rmuir
Date: Sun Feb 5 13:05:42 2012
New Revision: 1240714
URL: http://svn.apache.org/viewvc?rev=1240714&view=rev
Log:
LUCENE-3745: add proper Japanese stopping
Modified:
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt?rev=1240714&r1=1240713&r2=1240714&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt Sun Feb 5 13:05:42 2012
@@ -1,6 +1,14 @@
-# set of default stop tags:
-# uncomment a part of speech to treat those words as stopwords.
-# the entire tagset is provided here for convenience.
+#
+# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+#
+# Any token with a part-of-speech tag that exactly matches one defined in this
+# file is removed from the token stream.
+#
+# Set your own stoptags by uncommenting the lines below. Note that comments are
+# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
+# etc. that can be useful for building your own stoptag set.
+#
+# The entire possible tagset is provided below for convenience.
#
#####
# noun: unclassified nouns
@@ -188,25 +196,25 @@
#
#####
# prefix: unclassified prefixes
-接頭詞
+#接頭詞
#
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
# excluding numerical expressions.
# e.g. お (水), 某 (氏), 同 (社), 故 (～氏), 高 (品質), お (見事), ご (立派)
-接頭詞-名詞接続
+#接頭詞-名詞接続
#
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
# in conjunctive form followed by なる/なさる/くださる.
# e.g. お (読みなさい), お (座り)
-接頭詞-動詞接続
+#接頭詞-動詞接続
#
# prefix-adjectival: Prefixes that attach to adjectives.
# e.g. お (寒いですねえ), バカ (でかい)
-接頭詞-形容詞接続
+#接頭詞-形容詞接続
#
# prefix-numerical: Prefixes that attach to numerical expressions.
# e.g. 約, およそ, 毎時
-接頭詞-数接続
+#接頭詞-数接続
#
#####
# verb: unclassified verbs
@@ -216,7 +224,7 @@
#動詞-自立
#
# verb-auxiliary:
-動詞-非自立
+#動詞-非自立
#
# verb-suffix:
#動詞-接尾
@@ -351,11 +359,11 @@
# interjection: Greetings and other exclamations.
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
-感動詞
+#感動詞
#
#####
# symbol: unclassified Symbols.
-#記号
+記号
#
# symbol-misc: A general symbol not in one of the categories below.
# e.g. [○◎@$〒→+]
@@ -408,3 +416,5 @@
#####
# unknown: unknown part of speech.
#未知語
+#
+##### End of file
\ No newline at end of file
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt?rev=1240714&r1=1240713&r2=1240714&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt Sun Feb 5 13:05:42 2012
@@ -1,13 +1,122 @@
-# short set of japanese stopwords
-ãã
+#
+# This file defines a stopword set for Japanese.
+#
+# The set is made up of hand-picked frequent terms taken from segmented Japanese
+# Wikipedia. Punctuation characters and frequent kanji have mostly been left out.
+#
+# There is an overlap between these stopwords and the terms removed when used in
+# combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
+# that comments are not allowed on the same line as stopwords.
+#
+# See LUCENE-3745 for frequency lists, etc. that can be useful for making your own set.
+#
+の
+に
+は
+を
+た
+が
+で
+て
+と
+し
+れ
+さ
+ある
+いる
+も
する
-人ç©
-ãã¾
-ãããã¨
+から
+な
+こと
+として
+い
+や
+れる
+など
+なっ
+ない
+この
ため
+その
+あっ
+よう
+また
もの
-ããã¦
+という
+あり
+まで
+られ
なる
+へ
+か
+だ
+これ
+によって
+により
+おり
+より
+による
+ず
+なり
+られる
+において
+ば
+なかっ
+なく
+しかし
+について
+せ
+だっ
+その後
できる
-ãã
-ãã
+それ
+う
+ので
+なお
+のみ
+でき
+き
+つ
+における
+および
+いう
+さらに
+でも
+ら
+たり
+その他
+に関する
+たち
+ます
+ん
+なら
+に対して
+特に
+せる
+及び
+これら
+とき
+では
+にて
+ほか
+ながら
+うち
+そして
+とともに
+ただし
+かつて
+それぞれ
+または
+お
+ほど
+ものの
+に対する
+ほとんど
+と共に
+といった
+です
+とも
+ところ
+ここ
+##### End of file