Posted to java-commits@lucene.apache.org by ot...@apache.org on 2006/06/12 07:46:17 UTC
svn commit: r413584 [2/3] - in /lucene/java/trunk/contrib/memory/src:
java/org/apache/lucene/index/memory/ test/org/apache/lucene/index/memory/
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java Sun Jun 11 22:46:16 2006
@@ -63,397 +63,397 @@
* @author whoschek.AT.lbl.DOT.gov
*/
public class PatternAnalyzer extends Analyzer {
-
- /** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
- public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
-
- /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
-  /** <code>"\\s+"</code>; Divides text at whitespace (Character.isWhitespace(c)) */
-
- private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
- "a", "about", "above", "across", "adj", "after", "afterwards",
- "again", "against", "albeit", "all", "almost", "alone", "along",
- "already", "also", "although", "always", "among", "amongst", "an",
- "and", "another", "any", "anyhow", "anyone", "anything",
- "anywhere", "are", "around", "as", "at", "be", "became", "because",
- "become", "becomes", "becoming", "been", "before", "beforehand",
- "behind", "being", "below", "beside", "besides", "between",
- "beyond", "both", "but", "by", "can", "cannot", "co", "could",
- "down", "during", "each", "eg", "either", "else", "elsewhere",
- "enough", "etc", "even", "ever", "every", "everyone", "everything",
- "everywhere", "except", "few", "first", "for", "former",
- "formerly", "from", "further", "had", "has", "have", "he", "hence",
- "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
- "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
- "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
- "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
- "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
- "must", "my", "myself", "namely", "neither", "never",
- "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
- "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
-      "once", "one", "only", "onto", "or", "other", "others", "otherwise",
- "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
- "rather", "s", "same", "seem", "seemed", "seeming", "seems",
- "several", "she", "should", "since", "so", "some", "somehow",
- "someone", "something", "sometime", "sometimes", "somewhere",
- "still", "such", "t", "than", "that", "the", "their", "them",
- "themselves", "then", "thence", "there", "thereafter", "thereby",
- "therefor", "therein", "thereupon", "these", "they", "this",
- "those", "though", "through", "throughout", "thru", "thus", "to",
- "together", "too", "toward", "towards", "under", "until", "up",
- "upon", "us", "very", "via", "was", "we", "well", "were", "what",
- "whatever", "whatsoever", "when", "whence", "whenever",
- "whensoever", "where", "whereafter", "whereas", "whereat",
- "whereby", "wherefrom", "wherein", "whereinto", "whereof",
- "whereon", "whereto", "whereunto", "whereupon", "wherever",
- "wherewith", "whether", "which", "whichever", "whichsoever",
- "while", "whilst", "whither", "who", "whoever", "whole", "whom",
- "whomever", "whomsoever", "whose", "whosoever", "why", "will",
- "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
-      "xother", "xnote", "yet", "you", "your", "yours", "yourself",
- "yourselves"});
-
- /**
- * A lower-casing word analyzer with English stop words (can be shared
- * freely across threads without harm); global per class loader.
- */
- public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
- NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
-
- /**
-   * A lower-casing word analyzer with <b>extended</b> English stop words
- * (can be shared freely across threads without harm); global per class
- * loader. The stop words are borrowed from
- * http://thomas.loc.gov/home/stopwords.html, see
- * http://thomas.loc.gov/home/all.about.inquery.html
- */
- public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
- NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
-
- private final Pattern pattern;
- private final boolean toLowerCase;
- private final Set stopWords;
-
- /**
- * Constructs a new instance with the given parameters.
- *
- * @param pattern
- * a regular expression delimiting tokens
- * @param toLowerCase
- * if <code>true</code> returns tokens after applying
- * String.toLowerCase()
- * @param stopWords
- * if non-null, ignores all tokens that are contained in the
- * given stop set (after previously having applied toLowerCase()
- * if applicable). For example, created via
-   *            {@link StopFilter#makeStopSet(String[])} and/or
-   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
-   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
-   *            or <a href="http://www.unine.ch/info/clef/">other stop word
-   *            lists</a>.
- */
- public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
- if (pattern == null)
- throw new IllegalArgumentException("pattern must not be null");
-
- if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
- else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
-
- if (stopWords != null && stopWords.size() == 0) stopWords = null;
-
- this.pattern = pattern;
- this.toLowerCase = toLowerCase;
- this.stopWords = stopWords;
- }
-
- /**
- * Creates a token stream that tokenizes the given string into token terms
- * (aka words).
- *
- * @param fieldName
- * the name of the field to tokenize (currently ignored).
- * @param text
- * the string to tokenize
- * @return a new token stream
- */
- public TokenStream tokenStream(String fieldName, String text) {
- // Ideally the Analyzer superclass should have a method with the same signature,
- // with a default impl that simply delegates to the StringReader flavour.
- if (text == null)
- throw new IllegalArgumentException("text must not be null");
-
- TokenStream stream;
- if (pattern == NON_WORD_PATTERN) { // fast path
- stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
- }
- else if (pattern == WHITESPACE_PATTERN) { // fast path
- stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
- }
- else {
- stream = new PatternTokenizer(text, pattern, toLowerCase);
- if (stopWords != null) stream = new StopFilter(stream, stopWords);
- }
-
- return stream;
- }
-
- /**
-   * Creates a token stream that tokenizes all the text in the given Reader;
-   * this implementation forwards to <code>tokenStream(String, String)</code> and is
-   * less efficient than that method.
- *
- * @param fieldName
- * the name of the field to tokenize (currently ignored).
- * @param reader
- * the reader delivering the text
- * @return a new token stream
- */
- public TokenStream tokenStream(String fieldName, Reader reader) {
- if (reader instanceof FastStringReader) { // fast path
- return tokenStream(fieldName, ((FastStringReader)reader).getString());
- }
-
- try {
- String text = toString(reader);
- return tokenStream(fieldName, text);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Indicates whether some other object is "equal to" this one.
- *
- * @param other
- * the reference object with which to compare.
- * @return true if equal, false otherwise
- */
- public boolean equals(Object other) {
- if (this == other) return true;
- if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
- if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
-
- if (other instanceof PatternAnalyzer) {
- PatternAnalyzer p2 = (PatternAnalyzer) other;
- return
- toLowerCase == p2.toLowerCase &&
- eqPattern(pattern, p2.pattern) &&
- eq(stopWords, p2.stopWords);
- }
- return false;
- }
-
- /**
- * Returns a hash code value for the object.
- *
- * @return the hash code.
- */
- public int hashCode() {
- if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
- if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
-
- int h = 1;
- h = 31*h + pattern.pattern().hashCode();
- h = 31*h + pattern.flags();
- h = 31*h + (toLowerCase ? 1231 : 1237);
- h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
- return h;
- }
-
- /** equality where o1 and/or o2 can be null */
- private static boolean eq(Object o1, Object o2) {
- return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
- }
-
- /** assumes p1 and p2 are not null */
- private static boolean eqPattern(Pattern p1, Pattern p2) {
- return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
- }
-
- /**
- * Reads until end-of-stream and returns all read chars, finally closes the stream.
- *
- * @param input the input stream
- * @throws IOException if an I/O error occurs while reading the stream
- */
- private static String toString(Reader input) throws IOException {
- try {
- int len = 256;
- char[] buffer = new char[len];
- char[] output = new char[len];
-
- len = 0;
- int n;
- while ((n = input.read(buffer)) >= 0) {
- if (len + n > output.length) { // grow capacity
- char[] tmp = new char[Math.max(output.length << 1, len + n)];
- System.arraycopy(output, 0, tmp, 0, len);
- System.arraycopy(buffer, 0, tmp, len, n);
- buffer = output; // use larger buffer for future larger bulk reads
- output = tmp;
- } else {
- System.arraycopy(buffer, 0, output, len, n);
- }
- len += n;
- }
+
+ /** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
+ public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
+
+ /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
+  /** <code>"\\s+"</code>; Divides text at whitespace (Character.isWhitespace(c)) */
+
+ private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
+ "a", "about", "above", "across", "adj", "after", "afterwards",
+ "again", "against", "albeit", "all", "almost", "alone", "along",
+ "already", "also", "although", "always", "among", "amongst", "an",
+ "and", "another", "any", "anyhow", "anyone", "anything",
+ "anywhere", "are", "around", "as", "at", "be", "became", "because",
+ "become", "becomes", "becoming", "been", "before", "beforehand",
+ "behind", "being", "below", "beside", "besides", "between",
+ "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+ "down", "during", "each", "eg", "either", "else", "elsewhere",
+ "enough", "etc", "even", "ever", "every", "everyone", "everything",
+ "everywhere", "except", "few", "first", "for", "former",
+ "formerly", "from", "further", "had", "has", "have", "he", "hence",
+ "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+ "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+ "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+ "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+ "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+ "must", "my", "myself", "namely", "neither", "never",
+ "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+ "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+      "once", "one", "only", "onto", "or", "other", "others", "otherwise",
+ "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+ "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+ "several", "she", "should", "since", "so", "some", "somehow",
+ "someone", "something", "sometime", "sometimes", "somewhere",
+ "still", "such", "t", "than", "that", "the", "their", "them",
+ "themselves", "then", "thence", "there", "thereafter", "thereby",
+ "therefor", "therein", "thereupon", "these", "they", "this",
+ "those", "though", "through", "throughout", "thru", "thus", "to",
+ "together", "too", "toward", "towards", "under", "until", "up",
+ "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+ "whatever", "whatsoever", "when", "whence", "whenever",
+ "whensoever", "where", "whereafter", "whereas", "whereat",
+ "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+ "whereon", "whereto", "whereunto", "whereupon", "wherever",
+ "wherewith", "whether", "which", "whichever", "whichsoever",
+ "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+ "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+ "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+      "xother", "xnote", "yet", "you", "your", "yours", "yourself",
+ "yourselves"});
+
+ /**
+ * A lower-casing word analyzer with English stop words (can be shared
+ * freely across threads without harm); global per class loader.
+ */
+ public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
+ NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
+
+ /**
+   * A lower-casing word analyzer with <b>extended</b> English stop words
+ * (can be shared freely across threads without harm); global per class
+ * loader. The stop words are borrowed from
+ * http://thomas.loc.gov/home/stopwords.html, see
+ * http://thomas.loc.gov/home/all.about.inquery.html
+ */
+ public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
+ NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
+
+ private final Pattern pattern;
+ private final boolean toLowerCase;
+ private final Set stopWords;
+
+ /**
+ * Constructs a new instance with the given parameters.
+ *
+ * @param pattern
+ * a regular expression delimiting tokens
+ * @param toLowerCase
+ * if <code>true</code> returns tokens after applying
+ * String.toLowerCase()
+ * @param stopWords
+ * if non-null, ignores all tokens that are contained in the
+ * given stop set (after previously having applied toLowerCase()
+ * if applicable). For example, created via
+   *            {@link StopFilter#makeStopSet(String[])} and/or
+   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
+   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
+   *            or <a href="http://www.unine.ch/info/clef/">other stop word
+   *            lists</a>.
+ */
+ public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
+ if (pattern == null)
+ throw new IllegalArgumentException("pattern must not be null");
+
+ if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
+ else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
+
+ if (stopWords != null && stopWords.size() == 0) stopWords = null;
+
+ this.pattern = pattern;
+ this.toLowerCase = toLowerCase;
+ this.stopWords = stopWords;
+ }
+
+ /**
+ * Creates a token stream that tokenizes the given string into token terms
+ * (aka words).
+ *
+ * @param fieldName
+ * the name of the field to tokenize (currently ignored).
+ * @param text
+ * the string to tokenize
+ * @return a new token stream
+ */
+ public TokenStream tokenStream(String fieldName, String text) {
+ // Ideally the Analyzer superclass should have a method with the same signature,
+ // with a default impl that simply delegates to the StringReader flavour.
+ if (text == null)
+ throw new IllegalArgumentException("text must not be null");
+
+ TokenStream stream;
+ if (pattern == NON_WORD_PATTERN) { // fast path
+ stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
+ }
+ else if (pattern == WHITESPACE_PATTERN) { // fast path
+ stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
+ }
+ else {
+ stream = new PatternTokenizer(text, pattern, toLowerCase);
+ if (stopWords != null) stream = new StopFilter(stream, stopWords);
+ }
+
+ return stream;
+ }
+
+ /**
+   * Creates a token stream that tokenizes all the text in the given Reader;
+   * this implementation forwards to <code>tokenStream(String, String)</code> and is
+   * less efficient than that method.
+ *
+ * @param fieldName
+ * the name of the field to tokenize (currently ignored).
+ * @param reader
+ * the reader delivering the text
+ * @return a new token stream
+ */
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ if (reader instanceof FastStringReader) { // fast path
+ return tokenStream(fieldName, ((FastStringReader)reader).getString());
+ }
+
+ try {
+ String text = toString(reader);
+ return tokenStream(fieldName, text);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Indicates whether some other object is "equal to" this one.
+ *
+ * @param other
+ * the reference object with which to compare.
+ * @return true if equal, false otherwise
+ */
+ public boolean equals(Object other) {
+ if (this == other) return true;
+ if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
+ if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
+
+ if (other instanceof PatternAnalyzer) {
+ PatternAnalyzer p2 = (PatternAnalyzer) other;
+ return
+ toLowerCase == p2.toLowerCase &&
+ eqPattern(pattern, p2.pattern) &&
+ eq(stopWords, p2.stopWords);
+ }
+ return false;
+ }
+
+ /**
+ * Returns a hash code value for the object.
+ *
+ * @return the hash code.
+ */
+ public int hashCode() {
+ if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
+ if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
+
+ int h = 1;
+ h = 31*h + pattern.pattern().hashCode();
+ h = 31*h + pattern.flags();
+ h = 31*h + (toLowerCase ? 1231 : 1237);
+ h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
+ return h;
+ }
+
+ /** equality where o1 and/or o2 can be null */
+ private static boolean eq(Object o1, Object o2) {
+ return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
+ }
+
+ /** assumes p1 and p2 are not null */
+ private static boolean eqPattern(Pattern p1, Pattern p2) {
+ return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
+ }
+
+ /**
+ * Reads until end-of-stream and returns all read chars, finally closes the stream.
+ *
+ * @param input the input stream
+ * @throws IOException if an I/O error occurs while reading the stream
+ */
+ private static String toString(Reader input) throws IOException {
+ try {
+ int len = 256;
+ char[] buffer = new char[len];
+ char[] output = new char[len];
+
+ len = 0;
+ int n;
+ while ((n = input.read(buffer)) >= 0) {
+ if (len + n > output.length) { // grow capacity
+ char[] tmp = new char[Math.max(output.length << 1, len + n)];
+ System.arraycopy(output, 0, tmp, 0, len);
+ System.arraycopy(buffer, 0, tmp, len, n);
+ buffer = output; // use larger buffer for future larger bulk reads
+ output = tmp;
+ } else {
+ System.arraycopy(buffer, 0, output, len, n);
+ }
+ len += n;
+ }
-      return new String(output, 0, len); // return only the chars actually read
- } finally {
- if (input != null) input.close();
- }
- }
-
- /** somewhat oversized to minimize hash collisions */
- private static Set makeStopSet(String[] stopWords) {
- Set stops = new HashSet(stopWords.length * 2, 0.3f);
- stops.addAll(Arrays.asList(stopWords));
- return stops;
-// return Collections.unmodifiableSet(stops);
- }
+      return new String(output, 0, len); // return only the chars actually read
+ } finally {
+ if (input != null) input.close();
+ }
+ }
+
+ /** somewhat oversized to minimize hash collisions */
+ private static Set makeStopSet(String[] stopWords) {
+ Set stops = new HashSet(stopWords.length * 2, 0.3f);
+ stops.addAll(Arrays.asList(stopWords));
+ return stops;
+// return Collections.unmodifiableSet(stops);
+ }
-
- ///////////////////////////////////////////////////////////////////////////////
- // Nested classes:
- ///////////////////////////////////////////////////////////////////////////////
- /**
-   * The workhorse; performance isn't fantastic, but it's not nearly as bad
-   * as one might think - kudos to the Sun regex developers.
- */
- private static final class PatternTokenizer extends TokenStream {
-
- private final String str;
- private final boolean toLowerCase;
- private Matcher matcher;
- private int pos = 0;
- private static final Locale locale = Locale.getDefault();
-
- public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
- this.str = str;
- this.matcher = pattern.matcher(str);
- this.toLowerCase = toLowerCase;
- }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Nested classes:
+ ///////////////////////////////////////////////////////////////////////////////
+ /**
+   * The workhorse; performance isn't fantastic, but it's not nearly as bad
+   * as one might think - kudos to the Sun regex developers.
+ */
+ private static final class PatternTokenizer extends TokenStream {
+
+ private final String str;
+ private final boolean toLowerCase;
+ private Matcher matcher;
+ private int pos = 0;
+ private static final Locale locale = Locale.getDefault();
+
+ public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
+ this.str = str;
+ this.matcher = pattern.matcher(str);
+ this.toLowerCase = toLowerCase;
+ }
- public Token next() {
- if (matcher == null) return null;
-
- while (true) { // loop takes care of leading and trailing boundary cases
- int start = pos;
- int end;
- boolean isMatch = matcher.find();
- if (isMatch) {
- end = matcher.start();
- pos = matcher.end();
- } else {
- end = str.length();
- matcher = null; // we're finished
- }
-
- if (start != end) { // non-empty match (header/trailer)
- String text = str.substring(start, end);
- if (toLowerCase) text = text.toLowerCase(locale);
- return new Token(text, start, end);
- }
- if (!isMatch) return null;
- }
- }
-
- }
-
-
- ///////////////////////////////////////////////////////////////////////////////
- // Nested classes:
- ///////////////////////////////////////////////////////////////////////////////
- /**
- * Special-case class for best performance in common cases; this class is
- * otherwise unnecessary.
- */
- private static final class FastStringTokenizer extends TokenStream {
-
- private final String str;
- private int pos;
- private final boolean isLetter;
- private final boolean toLowerCase;
- private final Set stopWords;
- private static final Locale locale = Locale.getDefault();
-
- public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
- this.str = str;
- this.isLetter = isLetter;
- this.toLowerCase = toLowerCase;
- this.stopWords = stopWords;
- }
+ public Token next() {
+ if (matcher == null) return null;
+
+ while (true) { // loop takes care of leading and trailing boundary cases
+ int start = pos;
+ int end;
+ boolean isMatch = matcher.find();
+ if (isMatch) {
+ end = matcher.start();
+ pos = matcher.end();
+ } else {
+ end = str.length();
+ matcher = null; // we're finished
+ }
+
+ if (start != end) { // non-empty match (header/trailer)
+ String text = str.substring(start, end);
+ if (toLowerCase) text = text.toLowerCase(locale);
+ return new Token(text, start, end);
+ }
+ if (!isMatch) return null;
+ }
+ }
+
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Nested classes:
+ ///////////////////////////////////////////////////////////////////////////////
+ /**
+ * Special-case class for best performance in common cases; this class is
+ * otherwise unnecessary.
+ */
+ private static final class FastStringTokenizer extends TokenStream {
+
+ private final String str;
+ private int pos;
+ private final boolean isLetter;
+ private final boolean toLowerCase;
+ private final Set stopWords;
+ private static final Locale locale = Locale.getDefault();
+
+ public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
+ this.str = str;
+ this.isLetter = isLetter;
+ this.toLowerCase = toLowerCase;
+ this.stopWords = stopWords;
+ }
- public Token next() {
- // cache loop instance vars (performance)
- String s = str;
- int len = s.length();
- int i = pos;
- boolean letter = isLetter;
-
- int start = 0;
- String text;
- do {
- // find beginning of token
- text = null;
- while (i < len && !isTokenChar(s.charAt(i), letter)) {
- i++;
- }
-
- if (i < len) { // found beginning; now find end of token
- start = i;
- while (i < len && isTokenChar(s.charAt(i), letter)) {
- i++;
- }
-
- text = s.substring(start, i);
- if (toLowerCase) text = text.toLowerCase(locale);
-// if (toLowerCase) {
-//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
-//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
-// text = s.substring(start, i).toLowerCase();
-//// char[] chars = new char[i-start];
-//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
-//// text = new String(chars);
-// } else {
-// text = s.substring(start, i);
-// }
- }
- } while (text != null && isStopWord(text));
-
- pos = i;
- return text != null ? new Token(text, start, i) : null;
- }
-
- private boolean isTokenChar(char c, boolean isLetter) {
- return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
- }
-
- private boolean isStopWord(String text) {
- return stopWords != null && stopWords.contains(text);
- }
-
- }
+ public Token next() {
+ // cache loop instance vars (performance)
+ String s = str;
+ int len = s.length();
+ int i = pos;
+ boolean letter = isLetter;
+
+ int start = 0;
+ String text;
+ do {
+ // find beginning of token
+ text = null;
+ while (i < len && !isTokenChar(s.charAt(i), letter)) {
+ i++;
+ }
+
+ if (i < len) { // found beginning; now find end of token
+ start = i;
+ while (i < len && isTokenChar(s.charAt(i), letter)) {
+ i++;
+ }
+
+ text = s.substring(start, i);
+ if (toLowerCase) text = text.toLowerCase(locale);
+// if (toLowerCase) {
+//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
+//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
+// text = s.substring(start, i).toLowerCase();
+//// char[] chars = new char[i-start];
+//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
+//// text = new String(chars);
+// } else {
+// text = s.substring(start, i);
+// }
+ }
+ } while (text != null && isStopWord(text));
+
+ pos = i;
+ return text != null ? new Token(text, start, i) : null;
+ }
+
+ private boolean isTokenChar(char c, boolean isLetter) {
+ return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
+ }
+
+ private boolean isStopWord(String text) {
+ return stopWords != null && stopWords.contains(text);
+ }
+
+ }
-
- ///////////////////////////////////////////////////////////////////////////////
- // Nested classes:
- ///////////////////////////////////////////////////////////////////////////////
- /**
-   * A StringReader that exposes its contained string for fast direct access.
- * Might make sense to generalize this to CharSequence and make it public?
- */
- static final class FastStringReader extends StringReader {
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Nested classes:
+ ///////////////////////////////////////////////////////////////////////////////
+ /**
+   * A StringReader that exposes its contained string for fast direct access.
+ * Might make sense to generalize this to CharSequence and make it public?
+ */
+ static final class FastStringReader extends StringReader {
- private final String s;
-
- FastStringReader(String s) {
- super(s);
- this.s = s;
- }
-
- String getString() {
- return s;
- }
- }
-
+ private final String s;
+
+ FastStringReader(String s) {
+ super(s);
+ this.s = s;
+ }
+
+ String getString() {
+ return s;
+ }
+ }
+
}
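
For reference, a minimal usage sketch of the PatternAnalyzer above (illustrative only; the field name "content" and the sample text are assumptions, not part of this commit):

  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.index.memory.PatternAnalyzer;

  public class PatternAnalyzerExample {
    public static void main(String[] args) throws Exception {
      // shared, thread-safe analyzer: splits at non-letters, lower-cases,
      // and removes the default English stop words
      PatternAnalyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER;

      // "content" is a hypothetical field name; this class currently ignores it
      TokenStream stream = analyzer.tokenStream("content", "The Quick Brown Fox jumped");
      for (Token t = stream.next(); t != null; t = stream.next()) {
        System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")");
      }
    }
  }

With the default analyzer, "The" is dropped as a stop word and the remaining terms come back lower-cased with their original offsets.
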
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java Sun Jun 11 22:46:16 2006
@@ -75,325 +75,325 @@
*/
public class SynonymMap {
- /** the index data; Map<String word, String[] synonyms> */
- private final HashMap table;
-
- private static final String[] EMPTY = new String[0];
-
- private static final boolean DEBUG = false;
+ /** the index data; Map<String word, String[] synonyms> */
+ private final HashMap table;
+
+ private static final String[] EMPTY = new String[0];
+
+ private static final boolean DEBUG = false;
- /**
- * Constructs an instance, loading WordNet synonym data from the given input
- * stream. Finally closes the stream. The words in the stream must be in
- * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
- *
- * @param input
- * the stream to read from (null indicates an empty synonym map)
- * @throws IOException
-   *            if an error occurred while reading the stream.
- */
- public SynonymMap(InputStream input) throws IOException {
- this.table = input == null ? new HashMap(0) : read(toByteArray(input));
- }
-
- /**
- * Returns the synonym set for the given word, sorted ascending.
- *
- * @param word
- * the word to lookup (must be in lowercase).
- * @return the synonyms; a set of zero or more words, sorted ascending, each
- * word containing lowercase characters that satisfy
- * <code>Character.isLetter()</code>.
- */
- public String[] getSynonyms(String word) {
- Object syns = table.get(word);
- if (syns == null) return EMPTY;
- if (syns instanceof String) return new String[] {(String) syns};
-
- String[] synonyms = (String[]) syns;
- String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
- System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
- return copy;
- }
-
- /**
- * Returns a String representation of the index data for debugging purposes.
- *
- * @return a String representation
- */
- public String toString() {
- StringBuffer buf = new StringBuffer();
- Iterator iter = new TreeMap(table).keySet().iterator();
- int count = 0;
- int f0 = 0;
- int f1 = 0;
- int f2 = 0;
- int f3 = 0;
-
- while (iter.hasNext()) {
- String word = (String) iter.next();
- buf.append(word + ":");
- String[] synonyms = getSynonyms(word);
- buf.append(Arrays.asList(synonyms));
- buf.append("\n");
- count += synonyms.length;
- if (synonyms.length == 0) f0++;
- if (synonyms.length == 1) f1++;
- if (synonyms.length == 2) f2++;
- if (synonyms.length == 3) f3++;
- }
-
- buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
- return buf.toString();
- }
-
- /**
-   * Analyzes/transforms the given word while loading the input stream. This default implementation simply
- * lowercases the word. Override this method with a custom stemming
- * algorithm or similar, if desired.
- *
- * @param word
- * the word to analyze
- * @return the same word, or a different word (or null to indicate that the
- * word should be ignored)
- */
- protected String analyze(String word) {
- return word.toLowerCase();
- }
+ /**
+ * Constructs an instance, loading WordNet synonym data from the given input
+ * stream. Finally closes the stream. The words in the stream must be in
+ * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
+ *
+ * @param input
+ * the stream to read from (null indicates an empty synonym map)
+ * @throws IOException
+   *            if an error occurred while reading the stream.
+ */
+ public SynonymMap(InputStream input) throws IOException {
+ this.table = input == null ? new HashMap(0) : read(toByteArray(input));
+ }
+
+ /**
+ * Returns the synonym set for the given word, sorted ascending.
+ *
+ * @param word
+ * the word to lookup (must be in lowercase).
+ * @return the synonyms; a set of zero or more words, sorted ascending, each
+ * word containing lowercase characters that satisfy
+ * <code>Character.isLetter()</code>.
+ */
+ public String[] getSynonyms(String word) {
+ Object syns = table.get(word);
+ if (syns == null) return EMPTY;
+ if (syns instanceof String) return new String[] {(String) syns};
+
+ String[] synonyms = (String[]) syns;
+ String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
+ System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
+ return copy;
+ }
+
+ /**
+ * Returns a String representation of the index data for debugging purposes.
+ *
+ * @return a String representation
+ */
+ public String toString() {
+ StringBuffer buf = new StringBuffer();
+ Iterator iter = new TreeMap(table).keySet().iterator();
+ int count = 0;
+ int f0 = 0;
+ int f1 = 0;
+ int f2 = 0;
+ int f3 = 0;
+
+ while (iter.hasNext()) {
+ String word = (String) iter.next();
+ buf.append(word + ":");
+ String[] synonyms = getSynonyms(word);
+ buf.append(Arrays.asList(synonyms));
+ buf.append("\n");
+ count += synonyms.length;
+ if (synonyms.length == 0) f0++;
+ if (synonyms.length == 1) f1++;
+ if (synonyms.length == 2) f2++;
+ if (synonyms.length == 3) f3++;
+ }
+
+ buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
+ return buf.toString();
+ }
+
+ /**
+   * Analyzes/transforms the given word while loading the input stream. This default implementation simply
+ * lowercases the word. Override this method with a custom stemming
+ * algorithm or similar, if desired.
+ *
+ * @param word
+ * the word to analyze
+ * @return the same word, or a different word (or null to indicate that the
+ * word should be ignored)
+ */
+ protected String analyze(String word) {
+ return word.toLowerCase();
+ }
- private static boolean isValid(String str) {
- for (int i=str.length(); --i >= 0; ) {
- if (!Character.isLetter(str.charAt(i))) return false;
- }
- return true;
- }
+ private static boolean isValid(String str) {
+ for (int i=str.length(); --i >= 0; ) {
+ if (!Character.isLetter(str.charAt(i))) return false;
+ }
+ return true;
+ }
- private HashMap read(byte[] data) {
- int WORDS = (int) (76401 / 0.7); // presizing
- int GROUPS = (int) (88022 / 0.7); // presizing
- HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
- HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
- HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
+ private HashMap read(byte[] data) {
+ int WORDS = (int) (76401 / 0.7); // presizing
+ int GROUPS = (int) (88022 / 0.7); // presizing
+ HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
+ HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
+ HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
- Charset charset = Charset.forName("UTF-8");
- int lastNum = -1;
- Integer lastGroup = null;
- int len = data.length;
- int i=0;
-
- while (i < len) { // until EOF
- /* Part A: Parse a line */
-
- // scan to beginning of group
- while (i < len && data[i] != '(') i++;
- if (i >= len) break; // EOF
- i++;
-
- // parse group
- int num = 0;
- while (i < len && data[i] != ',') {
- num = 10*num + (data[i] - 48);
- i++;
- }
- i++;
-// if (DEBUG) System.err.println("num="+ num);
-
- // scan to beginning of word
- while (i < len && data[i] != '\'') i++;
- i++;
-
- // scan to end of word
- int start = i;
- do {
- while (i < len && data[i] != '\'') i++;
- i++;
- } while (i < len && data[i] != ','); // word must end with "',"
-
- if (i >= len) break; // EOF
- String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
-// String word = new String(data, 0, start, i-start-1); // ASCII
-
- /*
- * Part B: ignore phrases (with spaces and hyphens) and
- * non-alphabetic words, and let user customize word (e.g. do some
- * stemming)
- */
- if (!isValid(word)) continue; // ignore
- word = analyze(word);
- if (word == null || word.length() == 0) continue; // ignore
-
-
- /* Part C: Add (group,word) to tables */
-
- // ensure compact string representation, minimizing memory overhead
- String w = (String) internedWords.get(word);
- if (w == null) {
- word = new String(word); // ensure compact string
- internedWords.put(word, word);
- } else {
- word = w;
- }
-
- Integer group = lastGroup;
- if (num != lastNum) {
- group = new Integer(num);
- lastGroup = group;
- lastNum = num;
- }
-
- // add word --> group
- ArrayList groups = (ArrayList) word2Groups.get(word);
- if (groups == null) {
- groups = new ArrayList(1);
- word2Groups.put(word, groups);
- }
- groups.add(group);
+ Charset charset = Charset.forName("UTF-8");
+ int lastNum = -1;
+ Integer lastGroup = null;
+ int len = data.length;
+ int i=0;
+
+ while (i < len) { // until EOF
+ /* Part A: Parse a line */
+
+ // scan to beginning of group
+ while (i < len && data[i] != '(') i++;
+ if (i >= len) break; // EOF
+ i++;
+
+ // parse group
+ int num = 0;
+ while (i < len && data[i] != ',') {
+ num = 10*num + (data[i] - 48);
+ i++;
+ }
+ i++;
+// if (DEBUG) System.err.println("num="+ num);
+
+ // scan to beginning of word
+ while (i < len && data[i] != '\'') i++;
+ i++;
+
+ // scan to end of word
+ int start = i;
+ do {
+ while (i < len && data[i] != '\'') i++;
+ i++;
+ } while (i < len && data[i] != ','); // word must end with "',"
+
+ if (i >= len) break; // EOF
+ String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
+// String word = new String(data, 0, start, i-start-1); // ASCII
+
+ /*
+ * Part B: ignore phrases (with spaces and hyphens) and
+ * non-alphabetic words, and let user customize word (e.g. do some
+ * stemming)
+ */
+ if (!isValid(word)) continue; // ignore
+ word = analyze(word);
+ if (word == null || word.length() == 0) continue; // ignore
+
+
+ /* Part C: Add (group,word) to tables */
+
+ // ensure compact string representation, minimizing memory overhead
+ String w = (String) internedWords.get(word);
+ if (w == null) {
+ word = new String(word); // ensure compact string
+ internedWords.put(word, word);
+ } else {
+ word = w;
+ }
+
+ Integer group = lastGroup;
+ if (num != lastNum) {
+ group = new Integer(num);
+ lastGroup = group;
+ lastNum = num;
+ }
+
+ // add word --> group
+ ArrayList groups = (ArrayList) word2Groups.get(word);
+ if (groups == null) {
+ groups = new ArrayList(1);
+ word2Groups.put(word, groups);
+ }
+ groups.add(group);
- // add group --> word
- ArrayList words = (ArrayList) group2Words.get(group);
- if (words == null) {
- words = new ArrayList(1);
- group2Words.put(group, words);
- }
- words.add(word);
- }
-
-
- /* Part D: compute index data structure */
- HashMap word2Syns = createIndex(word2Groups, group2Words);
-
- /* Part E: minimize memory consumption by a factor 3 (or so) */
-// if (true) return word2Syns;
- word2Groups = null; // help gc
- group2Words = null; // help gc
- return optimize(word2Syns, internedWords);
- }
-
- private HashMap createIndex(Map word2Groups, Map group2Words) {
- HashMap word2Syns = new HashMap();
- Iterator iter = word2Groups.entrySet().iterator();
-
- while (iter.hasNext()) { // for each word
- Map.Entry entry = (Map.Entry) iter.next();
- ArrayList group = (ArrayList) entry.getValue();
- String word = (String) entry.getKey();
-
-// HashSet synonyms = new HashSet();
- TreeSet synonyms = new TreeSet();
- for (int i=group.size(); --i >= 0; ) { // for each groupID of word
- ArrayList words = (ArrayList) group2Words.get(group.get(i));
- for (int j=words.size(); --j >= 0; ) { // add all words
- Object synonym = words.get(j); // note that w and word are interned
-          if (synonym != word) { // a word is implicitly its own synonym
- synonyms.add(synonym);
- }
- }
- }
+ // add group --> word
+ ArrayList words = (ArrayList) group2Words.get(group);
+ if (words == null) {
+ words = new ArrayList(1);
+ group2Words.put(group, words);
+ }
+ words.add(word);
+ }
+
+
+ /* Part D: compute index data structure */
+ HashMap word2Syns = createIndex(word2Groups, group2Words);
+
+ /* Part E: minimize memory consumption by a factor 3 (or so) */
+// if (true) return word2Syns;
+ word2Groups = null; // help gc
+ group2Words = null; // help gc
+ return optimize(word2Syns, internedWords);
+ }
+
+ private HashMap createIndex(Map word2Groups, Map group2Words) {
+ HashMap word2Syns = new HashMap();
+ Iterator iter = word2Groups.entrySet().iterator();
+
+ while (iter.hasNext()) { // for each word
+ Map.Entry entry = (Map.Entry) iter.next();
+ ArrayList group = (ArrayList) entry.getValue();
+ String word = (String) entry.getKey();
+
+// HashSet synonyms = new HashSet();
+ TreeSet synonyms = new TreeSet();
+ for (int i=group.size(); --i >= 0; ) { // for each groupID of word
+ ArrayList words = (ArrayList) group2Words.get(group.get(i));
+ for (int j=words.size(); --j >= 0; ) { // add all words
+ Object synonym = words.get(j); // note that w and word are interned
+          if (synonym != word) { // a word is implicitly its own synonym
+ synonyms.add(synonym);
+ }
+ }
+ }
- int size = synonyms.size();
- if (size > 0) {
- String[] syns = new String[size];
- if (size == 1)
- syns[0] = (String) synonyms.first();
- else
- synonyms.toArray(syns);
-// if (syns.length > 1) Arrays.sort(syns);
-// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
- word2Syns.put(word, syns);
- }
- }
-
- return word2Syns;
- }
+ int size = synonyms.size();
+ if (size > 0) {
+ String[] syns = new String[size];
+ if (size == 1)
+ syns[0] = (String) synonyms.first();
+ else
+ synonyms.toArray(syns);
+// if (syns.length > 1) Arrays.sort(syns);
+// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
+ word2Syns.put(word, syns);
+ }
+ }
+
+ return word2Syns;
+ }
- private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
- if (DEBUG) {
- System.err.println("before gc");
- for (int i=0; i < 10; i++) System.gc();
- System.err.println("after gc");
- }
-
- // collect entries
- int len = 0;
- int size = word2Syns.size();
- String[][] allSynonyms = new String[size][];
- String[] words = new String[size];
- Iterator iter = word2Syns.entrySet().iterator();
- for (int j=0; j < size; j++) {
- Map.Entry entry = (Map.Entry) iter.next();
- allSynonyms[j] = (String[]) entry.getValue();
- words[j] = (String) entry.getKey();
- len += words[j].length();
- }
-
- // assemble large string containing all words
- StringBuffer buf = new StringBuffer(len);
- for (int j=0; j < size; j++) buf.append(words[j]);
- String allWords = new String(buf.toString()); // ensure compact string across JDK versions
- buf = null;
-
- // intern words at app level via memory-overlaid substrings
- for (int p=0, j=0; j < size; j++) {
- String word = words[j];
- internedWords.put(word, allWords.substring(p, p + word.length()));
- p += word.length();
- }
-
- // replace words with interned words
- for (int j=0; j < size; j++) {
- String[] syns = allSynonyms[j];
- for (int k=syns.length; --k >= 0; ) {
- syns[k] = (String) internedWords.get(syns[k]);
- }
- Object replacement = syns;
- if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
- word2Syns.remove(words[j]);
- word2Syns.put(internedWords.get(words[j]), replacement);
- }
-
- if (DEBUG) {
- words = null;
- allSynonyms = null;
- internedWords = null;
- allWords = null;
- System.err.println("before gc");
- for (int i=0; i < 10; i++) System.gc();
- System.err.println("after gc");
- }
- return word2Syns;
- }
-
-  // the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
- private static byte[] toByteArray(InputStream input) throws IOException {
- try {
-      // safe and fast even if input.available() behaves weirdly or is buggy
- int len = Math.max(256, input.available());
- byte[] buffer = new byte[len];
- byte[] output = new byte[len];
-
- len = 0;
- int n;
- while ((n = input.read(buffer)) >= 0) {
- if (len + n > output.length) { // grow capacity
- byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
- System.arraycopy(output, 0, tmp, 0, len);
- System.arraycopy(buffer, 0, tmp, len, n);
- buffer = output; // use larger buffer for future larger bulk reads
- output = tmp;
- } else {
- System.arraycopy(buffer, 0, output, len, n);
- }
- len += n;
- }
+ private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
+ if (DEBUG) {
+ System.err.println("before gc");
+ for (int i=0; i < 10; i++) System.gc();
+ System.err.println("after gc");
+ }
+
+ // collect entries
+ int len = 0;
+ int size = word2Syns.size();
+ String[][] allSynonyms = new String[size][];
+ String[] words = new String[size];
+ Iterator iter = word2Syns.entrySet().iterator();
+ for (int j=0; j < size; j++) {
+ Map.Entry entry = (Map.Entry) iter.next();
+ allSynonyms[j] = (String[]) entry.getValue();
+ words[j] = (String) entry.getKey();
+ len += words[j].length();
+ }
+
+ // assemble large string containing all words
+ StringBuffer buf = new StringBuffer(len);
+ for (int j=0; j < size; j++) buf.append(words[j]);
+ String allWords = new String(buf.toString()); // ensure compact string across JDK versions
+ buf = null;
+
+ // intern words at app level via memory-overlaid substrings
+ for (int p=0, j=0; j < size; j++) {
+ String word = words[j];
+ internedWords.put(word, allWords.substring(p, p + word.length()));
+ p += word.length();
+ }
+
+ // replace words with interned words
+ for (int j=0; j < size; j++) {
+ String[] syns = allSynonyms[j];
+ for (int k=syns.length; --k >= 0; ) {
+ syns[k] = (String) internedWords.get(syns[k]);
+ }
+ Object replacement = syns;
+ if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
+ word2Syns.remove(words[j]);
+ word2Syns.put(internedWords.get(words[j]), replacement);
+ }
+
+ if (DEBUG) {
+ words = null;
+ allSynonyms = null;
+ internedWords = null;
+ allWords = null;
+ System.err.println("before gc");
+ for (int i=0; i < 10; i++) System.gc();
+ System.err.println("after gc");
+ }
+ return word2Syns;
+ }
+
+  // the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
+ private static byte[] toByteArray(InputStream input) throws IOException {
+ try {
+      // safe and fast even if input.available() behaves weirdly or is buggy
+ int len = Math.max(256, input.available());
+ byte[] buffer = new byte[len];
+ byte[] output = new byte[len];
+
+ len = 0;
+ int n;
+ while ((n = input.read(buffer)) >= 0) {
+ if (len + n > output.length) { // grow capacity
+ byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
+ System.arraycopy(output, 0, tmp, 0, len);
+ System.arraycopy(buffer, 0, tmp, len, n);
+ buffer = output; // use larger buffer for future larger bulk reads
+ output = tmp;
+ } else {
+ System.arraycopy(buffer, 0, output, len, n);
+ }
+ len += n;
+ }
- if (len == output.length) return output;
- buffer = null; // help gc
- buffer = new byte[len];
- System.arraycopy(output, 0, buffer, 0, len);
- return buffer;
- } finally {
- if (input != null) input.close();
- }
- }
-
+ if (len == output.length) return output;
+ buffer = null; // help gc
+ buffer = new byte[len];
+ System.arraycopy(output, 0, buffer, 0, len);
+ return buffer;
+ } finally {
+ if (input != null) input.close();
+ }
+ }
+
}
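
As a rough sketch of how the SynonymMap above is typically consumed (the WordNet prolog file path is an assumption; any UTF-8 compatible synset file in the expected format would do):

  import java.io.FileInputStream;
  import org.apache.lucene.index.memory.SynonymMap;

  public class SynonymMapExample {
    public static void main(String[] args) throws Exception {
      // load once; the constructor reads the whole stream and closes it
      SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));

      // lookups expect a lowercase word and return a sorted (possibly empty) array
      String[] syns = map.getSynonyms("hot");
      for (int i = 0; i < syns.length; i++) {
        System.out.println(syns[i]);
      }
    }
  }

Subclasses can override analyze(String) if words should be stemmed or otherwise normalized while the file is loaded.
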
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java Sun Jun 11 22:46:16 2006
@@ -30,105 +30,105 @@
* @author whoschek.AT.lbl.DOT.gov
*/
public class SynonymTokenFilter extends TokenFilter {
-
- /** The Token.type used to indicate a synonym to higher level filters. */
- public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
+
+ /** The Token.type used to indicate a synonym to higher level filters. */
+ public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
- private final SynonymMap synonyms;
- private final int maxSynonyms;
-
- private String[] stack = null;
- private int index = 0;
- private Token current = null;
- private int todo = 0;
-
- /**
- * Creates an instance for the given underlying stream and synonym table.
- *
- * @param input
- * the underlying child token stream
- * @param synonyms
- * the map used to extract synonyms for terms
- * @param maxSynonyms
- * the maximum number of synonym tokens to return per underlying
- * token word (a value of Integer.MAX_VALUE indicates unlimited)
- */
- public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
- super(input);
- if (input == null)
- throw new IllegalArgumentException("input must not be null");
- if (synonyms == null)
- throw new IllegalArgumentException("synonyms must not be null");
- if (maxSynonyms < 0)
- throw new IllegalArgumentException("maxSynonyms must not be negative");
-
- this.synonyms = synonyms;
- this.maxSynonyms = maxSynonyms;
- }
-
- /** Returns the next token in the stream, or null at EOS. */
- public Token next() throws IOException {
- Token token;
- while (todo > 0 && index < stack.length) { // pop from stack
- token = createToken(stack[index++], current);
- if (token != null) {
- todo--;
- return token;
- }
- }
-
- token = input.next();
- if (token == null) return null; // EOS; iterator exhausted
-
- stack = synonyms.getSynonyms(token.termText()); // push onto stack
- if (stack.length > maxSynonyms) randomize(stack);
- index = 0;
- current = token;
- todo = maxSynonyms;
- return token;
- }
-
- /**
- * Creates and returns a token for the given synonym of the current input
-   * token; override for custom (stateless or stateful) behaviour, if desired.
- *
- * @param synonym
- * a synonym for the current token's term
- * @param current
- * the current token from the underlying child stream
- * @return a new token, or null to indicate that the given synonym should be
- * ignored
- */
- protected Token createToken(String synonym, Token current) {
- Token token = new Token(
- synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
- token.setPositionIncrement(0);
- return token;
- }
-
- /**
- * Randomize synonyms to later sample a subset. Uses constant random seed
-   * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
- * number generator with medium statistical quality (multiplicative
- * congruential method), producing integers in the range [Integer.MIN_VALUE,
- * Integer.MAX_VALUE].
- */
- private static void randomize(Object[] arr) {
- int seed = 1234567; // constant
- int randomState = 4*seed + 1;
-// Random random = new Random(seed); // unnecessary overhead
- int len = arr.length;
- for (int i=0; i < len-1; i++) {
- randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
- int r = randomState % (len-i);
- if (r < 0) r = -r; // e.g. -9 % 2 == -1
-// int r = random.nextInt(len-i);
-
- // swap arr[i, i+r]
- Object tmp = arr[i];
- arr[i] = arr[i + r];
- arr[i + r] = tmp;
- }
- }
-
+ private final SynonymMap synonyms;
+ private final int maxSynonyms;
+
+ private String[] stack = null;
+ private int index = 0;
+ private Token current = null;
+ private int todo = 0;
+
+ /**
+ * Creates an instance for the given underlying stream and synonym table.
+ *
+ * @param input
+ * the underlying child token stream
+ * @param synonyms
+ * the map used to extract synonyms for terms
+ * @param maxSynonyms
+ * the maximum number of synonym tokens to return per underlying
+ * token word (a value of Integer.MAX_VALUE indicates unlimited)
+ */
+ public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
+ super(input);
+ if (input == null)
+ throw new IllegalArgumentException("input must not be null");
+ if (synonyms == null)
+ throw new IllegalArgumentException("synonyms must not be null");
+ if (maxSynonyms < 0)
+ throw new IllegalArgumentException("maxSynonyms must not be negative");
+
+ this.synonyms = synonyms;
+ this.maxSynonyms = maxSynonyms;
+ }
+
+ /** Returns the next token in the stream, or null at EOS. */
+ public Token next() throws IOException {
+ Token token;
+ while (todo > 0 && index < stack.length) { // pop from stack
+ token = createToken(stack[index++], current);
+ if (token != null) {
+ todo--;
+ return token;
+ }
+ }
+
+ token = input.next();
+ if (token == null) return null; // EOS; iterator exhausted
+
+ stack = synonyms.getSynonyms(token.termText()); // push onto stack
+ if (stack.length > maxSynonyms) randomize(stack);
+ index = 0;
+ current = token;
+ todo = maxSynonyms;
+ return token;
+ }
+
+ /**
+ * Creates and returns a token for the given synonym of the current input
+   * token; override for custom (stateless or stateful) behaviour, if desired.
+ *
+ * @param synonym
+ * a synonym for the current token's term
+ * @param current
+ * the current token from the underlying child stream
+ * @return a new token, or null to indicate that the given synonym should be
+ * ignored
+ */
+ protected Token createToken(String synonym, Token current) {
+ Token token = new Token(
+ synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
+ token.setPositionIncrement(0);
+ return token;
+ }
+
+ /**
+ * Randomize synonyms to later sample a subset. Uses constant random seed
+   * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
+ * number generator with medium statistical quality (multiplicative
+ * congruential method), producing integers in the range [Integer.MIN_VALUE,
+ * Integer.MAX_VALUE].
+ */
+ private static void randomize(Object[] arr) {
+ int seed = 1234567; // constant
+ int randomState = 4*seed + 1;
+// Random random = new Random(seed); // unnecessary overhead
+ int len = arr.length;
+ for (int i=0; i < len-1; i++) {
+ randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
+ int r = randomState % (len-i);
+ if (r < 0) r = -r; // e.g. -9 % 2 == -1
+// int r = random.nextInt(len-i);
+
+ // swap arr[i, i+r]
+ Object tmp = arr[i];
+ arr[i] = arr[i + r];
+ arr[i + r] = tmp;
+ }
+ }
+
}
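
A hedged sketch of wiring SynonymTokenFilter into an analysis chain; the child StandardAnalyzer, the synonym file path, and the limit of 3 synonyms per token are illustrative choices, not requirements of this class:

  import java.io.FileInputStream;
  import java.io.Reader;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.memory.SynonymMap;
  import org.apache.lucene.index.memory.SynonymTokenFilter;

  public class SynonymAnalyzer extends Analyzer {
    private final Analyzer child = new StandardAnalyzer();
    private final SynonymMap synonyms;

    public SynonymAnalyzer(SynonymMap synonyms) {
      this.synonyms = synonyms;
    }

    public TokenStream tokenStream(String fieldName, Reader reader) {
      // each original token is followed by up to 3 synonyms at position increment 0
      return new SynonymTokenFilter(child.tokenStream(fieldName, reader), synonyms, 3);
    }
  }

  // usage (the file path is hypothetical):
  // SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
  // Analyzer analyzer = new SynonymAnalyzer(map);
  // ... hand the analyzer to IndexWriter or MemoryIndex as usual

Because the filter emits synonyms with a position increment of 0, phrase and proximity queries still line up with the original token positions.
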
Modified: lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (original)
+++ lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java Sun Jun 11 22:46:16 2006
@@ -197,319 +197,319 @@
 * @author whoschek.AT.lbl.DOT.gov
*/
public class MemoryIndexTest extends TestCase {
-
- private Analyzer analyzer;
- private boolean fastMode = false;
-
- private static final String FIELD_NAME = "content";
+
+ private Analyzer analyzer;
+ private boolean fastMode = false;
+
+ private static final String FIELD_NAME = "content";
- /** Runs the tests and/or benchmark */
- public static void main(String[] args) throws Throwable {
- new MemoryIndexTest().run(args);
- }
+ /** Runs the tests and/or benchmark */
+ public static void main(String[] args) throws Throwable {
+ new MemoryIndexTest().run(args);
+ }
-// public void setUp() { }
-// public void tearDown() {}
-
- public void testMany() throws Throwable {
- String[] files = listFiles(new String[] {
- "*.txt", "*.html", "*.xml", "xdocs/*.xml",
- "src/java/test/org/apache/lucene/queryParser/*.java",
- "src/java/org/apache/lucene/index/memory/*.java",
- });
- System.out.println("files = " + java.util.Arrays.asList(files));
- String[] xargs = new String[] {
- "1", "1", "memram",
- "@src/test/org/apache/lucene/index/memory/testqueries.txt",
- };
- String[] args = new String[xargs.length + files.length];
- System.arraycopy(xargs, 0, args, 0, xargs.length);
- System.arraycopy(files, 0, args, xargs.length, files.length);
- run(args);
- }
-
- private void run(String[] args) throws Throwable {
- int k = -1;
-
- int iters = 1;
- if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
-
- int runs = 1;
- if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
-
- String cmd = "memram";
- if (args.length > ++k) cmd = args[k];
- boolean useMemIndex = cmd.indexOf("mem") >= 0;
- boolean useRAMIndex = cmd.indexOf("ram") >= 0;
-
- String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
- if (args.length > ++k) {
- String arg = args[k];
- if (arg.startsWith("@"))
- queries = readLines(new File(arg.substring(1)));
- else
- queries = new String[] { arg };
- }
-
- File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
- if (args.length > ++k) {
- files = new File[args.length - k];
- for (int i=k; i < args.length; i++) {
- files[i-k] = new File(args[i]);
- }
- }
-
- boolean toLowerCase = true;
-// boolean toLowerCase = false;
-// Set stopWords = null;
- Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
-
- Analyzer[] analyzers = new Analyzer[] {
- new SimpleAnalyzer(),
- new StopAnalyzer(),
- new StandardAnalyzer(),
- PatternAnalyzer.DEFAULT_ANALYZER,
-// new WhitespaceAnalyzer(),
-// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
-// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
-// new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
- };
-
- for (int iter=0; iter < iters; iter++) {
- System.out.println("\n########### iteration=" + iter);
- long start = System.currentTimeMillis();
- long bytes = 0;
-
- for (int anal=0; anal < analyzers.length; anal++) {
- this.analyzer = analyzers[anal];
-
- for (int i=0; i < files.length; i++) {
- File file = files[i];
- if (!file.exists() || file.isDirectory()) continue; // ignore
- bytes += file.length();
- String text = toString(new FileInputStream(file), null);
- Document doc = createDocument(text);
- System.out.println("\n*********** FILE=" + file);
-
- for (int q=0; q < queries.length; q++) {
- try {
- Query query = parseQuery(queries[q]);
-
- for (int run=0; run < runs; run++) {
- float score1 = 0.0f; float score2 = 0.0f;
- if (useMemIndex) score1 = query(createMemoryIndex(doc), query);
- if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
- if (useMemIndex && useRAMIndex) {
- System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
- if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
- throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
- }
- }
- }
- } catch (Throwable t) {
- if (t instanceof OutOfMemoryError) t.printStackTrace();
- System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
- throw t;
- }
- }
- }
- }
- long end = System.currentTimeMillis();
- System.out.println("\nsecs = " + ((end-start)/1000.0f));
- System.out.println("queries/sec= " +
- (1.0f * runs * queries.length * analyzers.length * files.length
- / ((end-start)/1000.0f)));
- float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
- System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
- }
-
- if (useMemIndex && useRAMIndex)
- System.out.println("No bug found. done.");
- else
- System.out.println("Done benchmarking (without checking correctness).");
- }
-
- // returns file line by line, ignoring empty lines and comments
- private String[] readLines(File file) throws Exception {
- BufferedReader reader = new BufferedReader(new InputStreamReader(
- new FileInputStream(file)));
- ArrayList lines = new ArrayList();
- String line;
- while ((line = reader.readLine()) != null) {
- String t = line.trim();
- if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
- lines.add(line);
- }
- }
- reader.close();
-
- String[] result = new String[lines.size()];
- lines.toArray(result);
- return result;
- }
-
- private Document createDocument(String content) {
- Document doc = new Document();
- doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
- return doc;
- }
-
- private MemoryIndex createMemoryIndex(Document doc) {
- MemoryIndex index = new MemoryIndex();
- Enumeration iter = doc.fields();
- while (iter.hasMoreElements()) {
- Field field = (Field) iter.nextElement();
- index.addField(field.name(), field.stringValue(), analyzer);
- }
- return index;
- }
-
- private RAMDirectory createRAMIndex(Document doc) {
- RAMDirectory dir = new RAMDirectory();
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(dir, analyzer, true);
- writer.setMaxFieldLength(Integer.MAX_VALUE);
- writer.addDocument(doc);
- writer.optimize();
- return dir;
- } catch (IOException e) { // should never happen (RAMDirectory)
- throw new RuntimeException(e);
- } finally {
- try {
- if (writer != null) writer.close();
- } catch (IOException e) { // should never happen (RAMDirectory)
- throw new RuntimeException(e);
- }
- }
- }
-
- private float query(Object index, Query query) {
-// System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
- Searcher searcher = null;
- try {
- if (index instanceof Directory)
- searcher = new IndexSearcher((Directory)index);
- else
- searcher = ((MemoryIndex) index).createSearcher();
+// public void setUp() { }
+// public void tearDown() {}
+
+ public void testMany() throws Throwable {
+ String[] files = listFiles(new String[] {
+ "*.txt", "*.html", "*.xml", "xdocs/*.xml",
+ "src/java/test/org/apache/lucene/queryParser/*.java",
+ "src/java/org/apache/lucene/index/memory/*.java",
+ });
+ System.out.println("files = " + java.util.Arrays.asList(files));
+ String[] xargs = new String[] {
+ "1", "1", "memram",
+ "@src/test/org/apache/lucene/index/memory/testqueries.txt",
+ };
+ String[] args = new String[xargs.length + files.length];
+ System.arraycopy(xargs, 0, args, 0, xargs.length);
+ System.arraycopy(files, 0, args, xargs.length, files.length);
+ run(args);
+ }
+
+ private void run(String[] args) throws Throwable {
+ int k = -1;
+
+ int iters = 1;
+ if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
+
+ int runs = 1;
+ if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
+
+ String cmd = "memram";
+ if (args.length > ++k) cmd = args[k];
+ boolean useMemIndex = cmd.indexOf("mem") >= 0;
+ boolean useRAMIndex = cmd.indexOf("ram") >= 0;
+
+ String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
+ if (args.length > ++k) {
+ String arg = args[k];
+ if (arg.startsWith("@"))
+ queries = readLines(new File(arg.substring(1)));
+ else
+ queries = new String[] { arg };
+ }
+
+ File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
+ if (args.length > ++k) {
+ files = new File[args.length - k];
+ for (int i=k; i < args.length; i++) {
+ files[i-k] = new File(args[i]);
+ }
+ }
+
+ boolean toLowerCase = true;
+// boolean toLowerCase = false;
+// Set stopWords = null;
+ Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+
+ Analyzer[] analyzers = new Analyzer[] {
+ new SimpleAnalyzer(),
+ new StopAnalyzer(),
+ new StandardAnalyzer(),
+ PatternAnalyzer.DEFAULT_ANALYZER,
+// new WhitespaceAnalyzer(),
+// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
+// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
+// new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
+ };
+
+ for (int iter=0; iter < iters; iter++) {
+ System.out.println("\n########### iteration=" + iter);
+ long start = System.currentTimeMillis();
+ long bytes = 0;
+
+ for (int anal=0; anal < analyzers.length; anal++) {
+ this.analyzer = analyzers[anal];
+
+ for (int i=0; i < files.length; i++) {
+ File file = files[i];
+ if (!file.exists() || file.isDirectory()) continue; // ignore
+ bytes += file.length();
+ String text = toString(new FileInputStream(file), null);
+ Document doc = createDocument(text);
+ System.out.println("\n*********** FILE=" + file);
+
+ for (int q=0; q < queries.length; q++) {
+ try {
+ Query query = parseQuery(queries[q]);
+
+ for (int run=0; run < runs; run++) {
+ float score1 = 0.0f; float score2 = 0.0f;
+ if (useMemIndex) score1 = query(createMemoryIndex(doc), query);
+ if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
+ if (useMemIndex && useRAMIndex) {
+ System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
+ if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
+ throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
+ }
+ }
+ }
+ } catch (Throwable t) {
+ if (t instanceof OutOfMemoryError) t.printStackTrace();
+ System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
+ throw t;
+ }
+ }
+ }
+ }
+ long end = System.currentTimeMillis();
+ System.out.println("\nsecs = " + ((end-start)/1000.0f));
+ System.out.println("queries/sec= " +
+ (1.0f * runs * queries.length * analyzers.length * files.length
+ / ((end-start)/1000.0f)));
+ float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
+ System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
+ }
+
+ if (useMemIndex && useRAMIndex)
+ System.out.println("No bug found. done.");
+ else
+ System.out.println("Done benchmarking (without checking correctness).");
+ }
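The positional arguments parsed above are: iterations, runs per query, index mode ("mem", "ram" or "memram"), a query string (or "@file" naming a query file), followed by the files to index. A hypothetical invocation (classpath and paths are placeholders):

    java org.apache.lucene.index.memory.MemoryIndexTest 1 5 memram "Apach~ AND Copy*" CHANGES.txt LICENSE.txt

i.e. one iteration, five runs per query, with MemoryIndex and RAMDirectory scores compared against each other over the two listed files.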
+
+ // returns the file's lines one by one, ignoring empty lines and comment lines
+ private String[] readLines(File file) throws Exception {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(file)));
+ ArrayList lines = new ArrayList();
+ String line;
+ while ((line = reader.readLine()) != null) {
+ String t = line.trim();
+ if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
+ lines.add(line);
+ }
+ }
+ reader.close();
+
+ String[] result = new String[lines.size()];
+ lines.toArray(result);
+ return result;
+ }
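A query file passed as "@path" is therefore plain text with one query per line; blank lines and lines starting with '#' or '//' are skipped. A hypothetical snippet:

    # prefix and fuzzy queries
    term*
    term~
    // boolean query
    Apach~ AND Copy*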
+
+ private Document createDocument(String content) {
+ Document doc = new Document();
+ doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+ return doc;
+ }
+
+ private MemoryIndex createMemoryIndex(Document doc) {
+ MemoryIndex index = new MemoryIndex();
+ Enumeration iter = doc.fields();
+ while (iter.hasMoreElements()) {
+ Field field = (Field) iter.nextElement();
+ index.addField(field.name(), field.stringValue(), analyzer);
+ }
+ return index;
+ }
+
+ private RAMDirectory createRAMIndex(Document doc) {
+ RAMDirectory dir = new RAMDirectory();
+ IndexWriter writer = null;
+ try {
+ writer = new IndexWriter(dir, analyzer, true);
+ writer.setMaxFieldLength(Integer.MAX_VALUE);
+ writer.addDocument(doc);
+ writer.optimize();
+ return dir;
+ } catch (IOException e) { // should never happen (RAMDirectory)
+ throw new RuntimeException(e);
+ } finally {
+ try {
+ if (writer != null) writer.close();
+ } catch (IOException e) { // should never happen (RAMDirectory)
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private float query(Object index, Query query) {
+// System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
+ Searcher searcher = null;
+ try {
+ if (index instanceof Directory)
+ searcher = new IndexSearcher((Directory)index);
+ else
+ searcher = ((MemoryIndex) index).createSearcher();
- final float[] scores = new float[1]; // inits to 0.0f
- searcher.search(query, new HitCollector() {
- public void collect(int doc, float score) {
- scores[0] = score;
- }
- });
- float score = scores[0];
-// Hits hits = searcher.search(query);
-// float score = hits.length() > 0 ? hits.score(0) : 0.0f;
- return score;
- } catch (IOException e) { // should never happen (RAMDirectory)
- throw new RuntimeException(e);
- } finally {
- try {
- if (searcher != null) searcher.close();
- } catch (IOException e) { // should never happen (RAMDirectory)
- throw new RuntimeException(e);
- }
- }
- }
-
- private int getMemorySize(Object index) {
- if (index instanceof Directory) {
- try {
- Directory dir = (Directory) index;
- int size = 0;
- String[] fileNames = dir.list();
- for (int i=0; i < fileNames.length; i++) {
- size += dir.fileLength(fileNames[i]);
- }
- return size;
- }
- catch (IOException e) { // can never happen (RAMDirectory)
- throw new RuntimeException(e);
- }
- }
- else {
- return ((MemoryIndex) index).getMemorySize();
- }
- }
-
- private Query parseQuery(String expression) throws ParseException {
- QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
-// parser.setPhraseSlop(0);
- return parser.parse(expression);
- }
-
- /** returns all files matching the given file name patterns (quick n'dirty) */
- static String[] listFiles(String[] fileNames) {
- LinkedHashSet allFiles = new LinkedHashSet();
- for (int i=0; i < fileNames.length; i++) {
- int k;
- if ((k = fileNames[i].indexOf("*")) < 0) {
- allFiles.add(fileNames[i]);
- } else {
- String prefix = fileNames[i].substring(0, k);
- if (prefix.length() == 0) prefix = ".";
- final String suffix = fileNames[i].substring(k+1);
- File[] files = new File(prefix).listFiles(new FilenameFilter() {
- public boolean accept(File dir, String name) {
- return name.endsWith(suffix);
- }
- });
- if (files != null) {
- for (int j=0; j < files.length; j++) {
- allFiles.add(files[j].getPath());
- }
- }
- }
- }
-
- String[] result = new String[allFiles.size()];
- allFiles.toArray(result);
- return result;
- }
-
- // trick to detect default platform charset
- private static final Charset DEFAULT_PLATFORM_CHARSET =
- Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
-
- // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
- private static String toString(InputStream input, Charset charset) throws IOException {
- if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
- byte[] data = toByteArray(input);
- return charset.decode(ByteBuffer.wrap(data)).toString();
- }
-
- private static byte[] toByteArray(InputStream input) throws IOException {
- try {
- // safe and fast even if input.available() behaves weird or buggy
- int len = Math.max(256, input.available());
- byte[] buffer = new byte[len];
- byte[] output = new byte[len];
-
- len = 0;
- int n;
- while ((n = input.read(buffer)) >= 0) {
- if (len + n > output.length) { // grow capacity
- byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
- System.arraycopy(output, 0, tmp, 0, len);
- System.arraycopy(buffer, 0, tmp, len, n);
- buffer = output; // use larger buffer for future larger bulk reads
- output = tmp;
- } else {
- System.arraycopy(buffer, 0, output, len, n);
- }
- len += n;
- }
+ final float[] scores = new float[1]; // inits to 0.0f
+ searcher.search(query, new HitCollector() {
+ public void collect(int doc, float score) {
+ scores[0] = score;
+ }
+ });
+ float score = scores[0];
+// Hits hits = searcher.search(query);
+// float score = hits.length() > 0 ? hits.score(0) : 0.0f;
+ return score;
+ } catch (IOException e) { // should never happen (RAMDirectory)
+ throw new RuntimeException(e);
+ } finally {
+ try {
+ if (searcher != null) searcher.close();
+ } catch (IOException e) { // should never happen (RAMDirectory)
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private int getMemorySize(Object index) {
+ if (index instanceof Directory) {
+ try {
+ Directory dir = (Directory) index;
+ int size = 0;
+ String[] fileNames = dir.list();
+ for (int i=0; i < fileNames.length; i++) {
+ size += dir.fileLength(fileNames[i]);
+ }
+ return size;
+ }
+ catch (IOException e) { // can never happen (RAMDirectory)
+ throw new RuntimeException(e);
+ }
+ }
+ else {
+ return ((MemoryIndex) index).getMemorySize();
+ }
+ }
+
+ private Query parseQuery(String expression) throws ParseException {
+ QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
+// parser.setPhraseSlop(0);
+ return parser.parse(expression);
+ }
+
+ /** returns all files matching the given file name patterns (quick 'n' dirty) */
+ static String[] listFiles(String[] fileNames) {
+ LinkedHashSet allFiles = new LinkedHashSet();
+ for (int i=0; i < fileNames.length; i++) {
+ int k;
+ if ((k = fileNames[i].indexOf("*")) < 0) {
+ allFiles.add(fileNames[i]);
+ } else {
+ String prefix = fileNames[i].substring(0, k);
+ if (prefix.length() == 0) prefix = ".";
+ final String suffix = fileNames[i].substring(k+1);
+ File[] files = new File(prefix).listFiles(new FilenameFilter() {
+ public boolean accept(File dir, String name) {
+ return name.endsWith(suffix);
+ }
+ });
+ if (files != null) {
+ for (int j=0; j < files.length; j++) {
+ allFiles.add(files[j].getPath());
+ }
+ }
+ }
+ }
+
+ String[] result = new String[allFiles.size()];
+ allFiles.toArray(result);
+ return result;
+ }
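Note that these patterns are not full globs: each name is split at the first '*' into a directory prefix and a name suffix, and only the suffix is matched, via endsWith(). A hypothetical call from within this package:

    // all *.txt files in the current directory, plus all *.xml files directly under ./xdocs
    String[] files = MemoryIndexTest.listFiles(new String[] { "*.txt", "xdocs/*.xml" });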
+
+ // trick to detect default platform charset
+ private static final Charset DEFAULT_PLATFORM_CHARSET =
+ Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
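On JDK 1.5+ the same default is available directly; the InputStreamReader trick above presumably keeps the code compatible with older JDKs (an assumption):

    Charset defaultCharset = java.nio.charset.Charset.defaultCharset(); // JDK 1.5+ only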
+
+ // the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
+ private static String toString(InputStream input, Charset charset) throws IOException {
+ if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
+ byte[] data = toByteArray(input);
+ return charset.decode(ByteBuffer.wrap(data)).toString();
+ }
+
+ private static byte[] toByteArray(InputStream input) throws IOException {
+ try {
+ // safe and fast even if input.available() behaves weirdly or is buggy
+ int len = Math.max(256, input.available());
+ byte[] buffer = new byte[len];
+ byte[] output = new byte[len];
+
+ len = 0;
+ int n;
+ while ((n = input.read(buffer)) >= 0) {
+ if (len + n > output.length) { // grow capacity
+ byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
+ System.arraycopy(output, 0, tmp, 0, len);
+ System.arraycopy(buffer, 0, tmp, len, n);
+ buffer = output; // reuse the old output array as the (possibly larger) read buffer for future bulk reads
+ output = tmp;
+ } else {
+ System.arraycopy(buffer, 0, output, len, n);
+ }
+ len += n;
+ }
- if (len == output.length) return output;
- buffer = null; // help gc
- buffer = new byte[len];
- System.arraycopy(output, 0, buffer, 0, len);
- return buffer;
- } finally {
- if (input != null) input.close();
- }
- }
-
+ if (len == output.length) return output;
+ buffer = null; // help gc
+ buffer = new byte[len];
+ System.arraycopy(output, 0, buffer, 0, len);
+ return buffer;
+ } finally {
+ if (input != null) input.close();
+ }
+ }
+
}