You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by John Caron <ca...@unidata.ucar.edu> on 2002/09/22 06:36:47 UTC

stopwords

I am just starting to use Lucene, and it is very impressive! I hope to try 
Dmitri's new term vectors when he gets them in, in order to do vector model 
research, in particular LSA. I will port my existing code to use the Lucene 
framework, and make it available when it is ready.

I am appending a longer list of stop words, mostly from SMART, in case these are 
useful to anyone.

Thanks again!

/**
 * Extended English stop-word list, mostly taken from the SMART
 * information-retrieval system.
 *
 * All entries are lower-case and contain no punctuation (for example
 * "cant", "thats" and "theres" are spelled without an apostrophe), so
 * tokens must be normalized the same way before being compared against
 * this list. Single letters "a" through "z" are included as well.
 *
 * Every word is listed exactly once, which keeps the array safe to pass
 * to set factories that reject duplicate elements.
 */
private static String smart[] =  {
   "a",
   "able",
   "about",
   "above",
   "according",
   "accordingly",
   "across",
   "actually",
   "after",
   "afterwards",
   "again",
   "against",
   "all",
   "allow",
   "allows",
   "almost",
   "alone",
   "along",
   "already",
   "also",
   "although",
   "always",
   "am",
   "among",
   "amongst",
   "an",
   "and",
   "another",
   "any",
   "anybody",
   "anyhow",
   "anyone",
   "anything",
   "anyway",
   "anyways",
   "anywhere",
   "apart",
   "appear",
   "appreciate",
   "appropriate",
   "are",
   "around",
   "as",
   "aside",
   "ask",
   "asking",
   "associated",
   "at",
   "available",
   "away",
   "awfully",
   "b",
   "be",
   "became",
   "because",
   "become",
   "becomes",
   "becoming",
   "been",
   "before",
   "beforehand",
   "behind",
   "being",
   "believe",
   "below",
   "beside",
   "besides",
   "best",
   "better",
   "between",
   "beyond",
   "both",
   "brief",
   "but",
   "by",
   "c",
   "came",
   "can",
   "cannot",
   "cant",
   "cause",
   "causes",
   "certain",
   "certainly",
   "changes",
   "clearly",
   "co",
   "com",
   "come",
   "comes",
   "concerning",
   "consequently",
   "consider",
   "considering",
   "contain",
   "containing",
   "contains",
   "corresponding",
   "could",
   "course",
   "currently",
   "d",
   "definitely",
   "described",
   "despite",
   "did",
   "different",
   "do",
   "does",
   "doing",
   "done",
   "down",
   "downwards",
   "during",
   "e",
   "each",
   "edu",
   "eg",
   "eight",
   "either",
   "else",
   "elsewhere",
   "enough",
   "entirely",
   "especially",
   "et",
   "etc",
   "even",
   "ever",
   "every",
   "everybody",
   "everyone",
   "everything",
   "everywhere",
   "ex",
   "exactly",
   "example",
   "except",
   "f",
   "far",
   "few",
   "fifth",
   "first",
   "five",
   "followed",
   "following",
   "follows",
   "for",
   "former",
   "formerly",
   "forth",
   "four",
   "from",
   "further",
   "furthermore",
   "g",
   "get",
   "gets",
   "getting",
   "given",
   "gives",
   "go",
   "goes",
   "going",
   "gone",
   "got",
   "gotten",
   "greetings",
   "h",
   "had",
   "happens",
   "hardly",
   "has",
   "have",
   "having",
   "he",
   "hello",
   "help",
   "hence",
   "her",
   "here",
   "hereafter",
   "hereby",
   "herein",
   "hereupon",
   "hers",
   "herself",
   "hi",
   "him",
   "himself",
   "his",
   "hither",
   "hopefully",
   "how",
   "howbeit",
   "however",
   "i",
   "ie",
   "if",
   "ignored",
   "immediate",
   "in",
   "inasmuch",
   "inc",
   "indeed",
   "indicate",
   "indicated",
   "indicates",
   "inner",
   "insofar",
   "instead",
   "into",
   "inward",
   "is",
   "it",
   "its",
   "itself",
   "j",
   "just",
   "k",
   "keep",
   "keeps",
   "kept",
   "know",
   "knows",
   "known",
   "l",
   "last",
   "lately",
   "later",
   "latter",
   "latterly",
   "least",
   "less",
   "lest",
   "let",
   "like",
   "liked",
   "likely",
   "little",
   "look",
   "looking",
   "looks",
   "ltd",
   "m",
   "mainly",
   "many",
   "may",
   "maybe",
   "me",
   "mean",
   "meanwhile",
   "merely",
   "might",
   "more",
   "moreover",
   "most",
   "mostly",
   "much",
   "must",
   "my",
   "myself",
   "n",
   "name",
   "namely",
   "nd",
   "near",
   "nearly",
   "necessary",
   "need",
   "needs",
   "neither",
   "never",
   "nevertheless",
   "new",
   "next",
   "nine",
   "no",
   "nobody",
   "non",
   "none",
   "noone",
   "nor",
   "normally",
   "not",
   "nothing",
   "novel",
   "now",
   "nowhere",
   "o",
   "obviously",
   "of",
   "off",
   "often",
   "oh",
   "ok",
   "okay",
   "old",
   "on",
   "once",
   "one",
   "ones",
   "only",
   "onto",
   "or",
   "other",
   "others",
   "otherwise",
   "ought",
   "our",
   "ours",
   "ourselves",
   "out",
   "outside",
   "over",
   "overall",
   "own",
   "p",
   "particular",
   "particularly",
   "per",
   "perhaps",
   "placed",
   "please",
   "plus",
   "possible",
   "presumably",
   "probably",
   "provides",
   "q",
   "que",
   "quite",
   "qv",
   "r",
   "rather",
   "rd",
   "re",
   "really",
   "reasonably",
   "regarding",
   "regardless",
   "regards",
   "relatively",
   "respectively",
   "right",
   "s",
   "said",
   "same",
   "saw",
   "say",
   "saying",
   "says",
   "second",
   "secondly",
   "see",
   "seeing",
   "seem",
   "seemed",
   "seeming",
   "seems",
   "seen",
   "self",
   "selves",
   "sensible",
   "sent",
   "serious",
   "seriously",
   "seven",
   "several",
   "shall",
   "she",
   "should",
   "since",
   "six",
   "so",
   "some",
   "somebody",
   "somehow",
   "someone",
   "something",
   "sometime",
   "sometimes",
   "somewhat",
   "somewhere",
   "soon",
   "sorry",
   "specified",
   "specify",
   "specifying",
   "still",
   "sub",
   "such",
   "sup",
   "sure",
   "t",
   "take",
   "taken",
   "tell",
   "tends",
   "th",
   "than",
   "thank",
   "thanks",
   "thanx",
   "that",
   "thats",
   "the",
   "their",
   "theirs",
   "them",
   "themselves",
   "then",
   "thence",
   "there",
   "thereafter",
   "thereby",
   "therefore",
   "therein",
   "theres",
   "thereupon",
   "these",
   "they",
   "think",
   "third",
   "this",
   "thorough",
   "thoroughly",
   "those",
   "though",
   "three",
   "through",
   "throughout",
   "thru",
   "thus",
   "to",
   "together",
   "too",
   "took",
   "toward",
   "towards",
   "tried",
   "tries",
   "truly",
   "try",
   "trying",
   "twice",
   "two",
   "u",
   "un",
   "under",
   "unfortunately",
   "unless",
   "unlikely",
   "until",
   "unto",
   "up",
   "upon",
   "us",
   "use",
   "used",
   "useful",
   "uses",
   "using",
   "usually",
   "uucp",
   "v",
   "value",
   "various",
   "very",
   "via",
   "viz",
   "vs",
   "w",
   "want",
   "wants",
   "was",
   "way",
   "we",
   "welcome",
   "well",
   "went",
   "were",
   "what",
   "whatever",
   "when",
   "whence",
   "whenever",
   "where",
   "whereafter",
   "whereas",
   "whereby",
   "wherein",
   "whereupon",
   "wherever",
   "whether",
   "which",
   "while",
   "whither",
   "who",
   "whoever",
   "whole",
   "whom",
   "whose",
   "why",
   "will",
   "willing",
   "wish",
   "with",
   "within",
   "without",
   "wonder",
   "would",
   "x",
   "y",
   "yes",
   "yet",
   "you",
   "your",
   "yours",
   "yourself",
   "yourselves",
   "z",
   "zero"
   };


--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>


Re: stopwords

Posted by Otis Gospodnetic <ot...@yahoo.com>.
Thanks.  I may stick this in the Lucene CVS repository somewhere.

Otis

--- John Caron <ca...@unidata.ucar.edu> wrote:
> i am just starting to use lucene, and it it very impressive! I hope
> to try 
> Dmitri's new term vectors when he gets them in, in order to do vector
> model 
> research, in particular LSA. i will port my existing code to use
> lucene 
> framework, and make it available when it is ready.
> 
> I am appending a longer list of stop words, mostly from SMART, in
> case these are 
> useful to anyone.
> 
> Thanks again!
> 
> private static String smart[] =  {
>    "a",
>    "able",
>    "about",
>    "above",
>    "according",
>    "accordingly",
>    "across",
>    "actually",
>    "after",
>    "afterwards",
>    "again",
>    "against",
>    "all",
>    "allow",
>    "allows",
>    "almost",
>    "alone",
>    "along",
>    "already",
>    "also",
>    "although",
>    "always",
>    "am",
>    "among",
>    "amongst",
>    "an",
>    "and",
>    "another",
>    "any",
>    "anybody",
>    "anyhow",
>    "anyone",
>    "anything",
>    "anyway",
>    "anyways",
>    "anywhere",
>    "apart",
>    "appear",
>    "appreciate",
>    "appropriate",
>    "are",
>    "around",
>    "as",
>    "aside",
>    "ask",
>    "asking",
>    "associated",
>    "at",
>    "available",
>    "away",
>    "awfully",
>    "b",
>    "be",
>    "became",
>    "because",
>    "become",
>    "becomes",
>    "becoming",
>    "been",
>    "before",
>    "beforehand",
>    "behind",
>    "being",
>    "believe",
>    "below",
>    "beside",
>    "besides",
>    "best",
>    "better",
>    "between",
>    "beyond",
>    "both",
>    "brief",
>    "but",
>    "by",
>    "c",
>    "came",
>    "can",
>    "cannot",
>    "cant",
>    "cause",
>    "causes",
>    "certain",
>    "certainly",
>    "changes",
>    "clearly",
>    "co",
>    "com",
>    "come",
>    "comes",
>    "concerning",
>    "consequently",
>    "consider",
>    "considering",
>    "contain",
>    "containing",
>    "contains",
>    "corresponding",
>    "could",
>    "course",
>    "currently",
>    "d",
>    "definitely",
>    "described",
>    "despite",
>    "did",
>    "different",
>    "do",
>    "does",
>    "doing",
>    "done",
>    "down",
>    "downwards",
>    "during",
>    "e",
>    "each",
>    "edu",
>    "eg",
>    "eight",
>    "either",
>    "else",
>    "elsewhere",
>    "enough",
>    "entirely",
>    "especially",
>    "et",
>    "etc",
>    "even",
>    "ever",
>    "every",
>    "everybody",
>    "everyone",
>    "everything",
>    "everywhere",
>    "ex",
>    "exactly",
>    "example",
>    "except",
>    "f",
>    "far",
>    "few",
>    "fifth",
>    "first",
>    "five",
>    "followed",
>    "following",
>    "follows",
>    "for",
>    "former",
>    "formerly",
>    "forth",
>    "four",
>    "from",
>    "further",
>    "furthermore",
>    "g",
>    "get",
>    "gets",
>    "getting",
>    "given",
>    "gives",
>    "go",
>    "goes",
>    "going",
>    "gone",
>    "got",
>    "gotten",
>    "greetings",
>    "h",
>    "had",
>    "happens",
>    "hardly",
>    "has",
>    "have",
>    "having",
>    "he",
>    "hello",
>    "help",
>    "hence",
>    "her",
>    "here",
>    "hereafter",
>    "hereby",
>    "herein",
>    "hereupon",
>    "hers",
>    "herself",
>    "hi",
>    "him",
>    "himself",
>    "his",
>    "hither",
>    "hopefully",
>    "how",
>    "howbeit",
>    "however",
>    "i",
>    "ie",
>    "if",
>    "ignored",
>    "immediate",
>    "in",
>    "inasmuch",
>    "inc",
>    "indeed",
>    "indicate",
>    "indicated",
>    "indicates",
>    "inner",
>    "insofar",
>    "instead",
>    "into",
>    "inward",
>    "is",
>    "it",
>    "its",
>    "itself",
>    "j",
>    "just",
>    "k",
>    "keep",
>    "keeps",
>    "kept",
>    "know",
>    "knows",
>    "known",
>    "l",
>    "last",
>    "lately",
>    "later",
>    "latter",
>    "latterly",
>    "least",
>    "less",
>    "lest",
>    "let",
>    "like",
>    "liked",
>    "likely",
>    "little",
>    "look",
>    "looking",
>    "looks",
>    "ltd",
>    "m",
>    "mainly",
>    "many",
>    "may",
>    "maybe",
>    "me",
>    "mean",
>    "meanwhile",
>    "merely",
>    "might",
>    "more",
>    "moreover",
>    "most",
>    "mostly",
>    "much",
>    "must",
>    "my",
>    "myself",
>    "n",
>    "name",
>    "namely",
>    "nd",
>    "near",
>    "nearly",
>    "necessary",
>    "need",
>    "needs",
>    "neither",
>    "never",
>    "nevertheless",
>    "new",
>    "next",
>    "nine",
>    "no",
>    "nobody",
>    "non",
>    "none",
>    "noone",
>    "nor",
>    "normally",
>    "not",
>    "nothing",
>    "novel",
>    "now",
>    "nowhere",
>    "o",
>    "obviously",
>    "of",
>    "off",
>    "often",
>    "oh",
>    "ok",
>    "okay",
>    "old",
>    "on",
>    "once",
>    "one",
>    "ones",
>    "only",
>    "onto",
>    "or",
>    "other",
>    "others",
>    "otherwise",
>    "ought",
>    "our",
>    "ours",
>    "ourselves",
>    "out",
>    "outside",
>    "over",
>    "overall",
>    "own",
>    "p",
>    "particular",
>    "particularly",
>    "per",
>    "perhaps",
>    "placed",
>    "please",
>    "plus",
>    "possible",
>    "presumably",
>    "probably",
>    "provides",
>    "q",
>    "que",
>    "quite",
>    "qv",
>    "r",
>    "rather",
>    "rd",
>    "re",
>    "really",
>    "reasonably",
>    "regarding",
>    "regardless",
>    "regards",
>    "relatively",
>    "respectively",
>    "right",
>    "s",
>    "said",
>    "same",
>    "saw",
>    "say",
>    "saying",
>    "says",
>    "second",
>    "secondly",
>    "see",
>    "seeing",
>    "seem",
>    "seemed",
>    "seeming",
>    "seems",
>    "seen",
>    "self",
>    "selves",
>    "sensible",
>    "sent",
>    "serious",
>    "seriously",
>    "seven",
>    "several",
>    "shall",
>    "she",
>    "should",
>    "since",
>    "six",
>    "so",
>    "some",
>    "somebody",
>    "somehow",
>    "someone",
>    "something",
>    "sometime",
>    "sometimes",
>    "somewhat",
>    "somewhere",
>    "soon",
>    "sorry",
>    "specified",
>    "specify",
>    "specifying",
>    "still",
>    "sub",
>    "such",
>    "sup",
>    "sure",
>    "t",
>    "take",
>    "taken",
>    "tell",
>    "tends",
>    "th",
>    "than",
>    "thank",
>    "thanks",
>    "thanx",
>    "that",
>    "thats",
>    "the",
>    "their",
>    "theirs",
>    "them",
>    "themselves",
>    "then",
>    "thence",
>    "there",
>    "thereafter",
>    "thereby",
>    "therefore",
>    "therein",
>    "theres",
>    "thereupon",
>    "these",
>    "they",
>    "think",
>    "third",
>    "this",
>    "thorough",
>    "thoroughly",
>    "those",
>    "though",
>    "three",
>    "through",
>    "throughout",
>    "thru",
>    "thus",
>    "to",
>    "together",
>    "too",
>    "took",
>    "toward",
>    "towards",
>    "tried",
>    "tries",
>    "truly",
>    "try",
>    "trying",
>    "twice",
>    "two",
>    "u",
>    "un",
>    "under",
>    "unfortunately",
>    "unless",
>    "unlikely",
>    "until",
>    "unto",
>    "up",
>    "upon",
>    "us",
>    "use",
>    "used",
>    "useful",
>    "uses",
>    "using",
>    "usually",
>    "uucp",
>    "v",
>    "value",
>    "various",
>    "very",
>    "via",
>    "viz",
>    "vs",
>    "w",
>    "want",
>    "wants",
>    "was",
>    "way",
>    "we",
>    "welcome",
>    "well",
>    "went",
>    "were",
>    "what",
>    "whatever",
>    "when",
>    "whence",
>    "whenever",
>    "where",
>    "whereafter",
>    "whereas",
>    "whereby",
>    "wherein",
>    "whereupon",
>    "wherever",
>    "whether",
>    "which",
>    "while",
>    "whither",
>    "who",
>    "whoever",
>    "whole",
>    "whom",
>    "whose",
>    "why",
>    "will",
>    "willing",
>    "wish",
>    "with",
>    "within",
>    "without",
>    "wonder",
>    "would",
>    "would",
>    "x",
>    "y",
>    "yes",
>    "yet",
>    "you",
>    "your",
>    "yours",
>    "yourself",
>    "yourselves",
>    "z",
>    "zero"
>    };
> 
> 
> --
> To unsubscribe, e-mail:  
> <ma...@jakarta.apache.org>
> For additional commands, e-mail:
> <ma...@jakarta.apache.org>
> 


__________________________________________________
Do you Yahoo!?
Faith Hill - Exclusive Performances, Videos & More
http://faith.yahoo.com

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>