You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/03/25 03:58:15 UTC

svn commit: r1304978 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/core/src/ lucene/core/src/java/org/apache/lucene/analysis/standard/ lucene/core/src/test/org/apache/lucene/analysis/ solr/

Author: sarowe
Date: Sun Mar 25 01:58:14 2012
New Revision: 1304978

URL: http://svn.apache.org/viewvc?rev=1304978&view=rev
Log:
LUCENE-3881: Added UAX29URLEmailAnalyzer

Added:
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
      - copied, changed from r1304975, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
    lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java
      - copied, changed from r1304975, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/core/src/   (props changed)
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html
    lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1304978&r1=1304977&r2=1304978&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Sun Mar 25 01:58:14 2012
@@ -197,6 +197,9 @@ New Features
 * LUCENE-3789: Expose MTQ TermsEnum via RewriteMethod for non package private
   access (Simon Willnauer)
   
+* LUCENE-3881: Added UAX29URLEmailAnalyzer: a standard analyzer that recognizes
+  URLs and emails. (Steve Rowe)
+
 Bug fixes
 
 * LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter

Copied: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java (from r1304975, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java?p2=lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java&p1=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java&r1=1304975&r2=1304978&rev=1304978&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java Sun Mar 25 01:58:14 2012
@@ -17,22 +17,18 @@ package org.apache.lucene.analysis.stand
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Set;
 
 /**
  * Filters {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer}
  * with {@link org.apache.lucene.analysis.standard.StandardFilter},
- * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and
- * {@link org.apache.lucene.analysis.core.StopFilter}, using a list of
+ * {@link org.apache.lucene.analysis.LowerCaseFilter} and
+ * {@link org.apache.lucene.analysis.StopFilter}, using a list of
  * English stop words.
  *
  * <a name="version"/>
@@ -50,13 +46,13 @@ public final class UAX29URLEmailAnalyzer
 
   /** An unmodifiable set containing some common English words that are usually not
   useful for searching. */
-  public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
 
   /** Builds an analyzer with the given stop words.
    * @param matchVersion Lucene version to match See {@link
    * <a href="#version">above</a>}
    * @param stopWords stop words */
-  public UAX29URLEmailAnalyzer(Version matchVersion, CharArraySet stopWords) {
+  public UAX29URLEmailAnalyzer(Version matchVersion, Set<?> stopWords) {
     super(matchVersion, stopWords);
   }
 
@@ -70,19 +66,19 @@ public final class UAX29URLEmailAnalyzer
   }
 
   /** Builds an analyzer with the stop words from the given reader.
-   * @see org.apache.lucene.analysis.util.WordlistLoader#getWordSet(java.io.Reader, org.apache.lucene.util.Version)
+   * @see org.apache.lucene.analysis.WordlistLoader#getWordSet(java.io.Reader, org.apache.lucene.util.Version)
    * @param matchVersion Lucene version to match See {@link
    * <a href="#version">above</a>}
    * @param stopwords Reader to read stop words from */
   public UAX29URLEmailAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
-    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
+    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
   }
 
   /**
    * Set maximum allowed token length.  If a token is seen
    * that exceeds this length then it is discarded.  This
    * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * reusableTokenStream is called.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -104,9 +100,9 @@ public final class UAX29URLEmailAnalyzer
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected boolean reset(final Reader reader) throws IOException {
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        return super.reset(reader);
       }
     };
   }

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html?rev=1304978&r1=1304977&r2=1304978&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html Sun Mar 25 01:58:14 2012
@@ -55,6 +55,12 @@
         algorithm, as specified in 
         <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
         URLs and email addresses are also tokenized according to the relevant RFCs.
+        <br/>
+        <code><a href="UAX29URLEmailAnalyzer">UAX29URLEmailAnalyzer</a></code> includes
+        <code>UAX29URLEmailTokenizer</code>,
+        <code><a href="StandardFilter">StandardFilter</a></code>,
+        <code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
+        and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
     </li>
 </ul>
 </body>

Copied: lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java (from r1304975, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java?p2=lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java&p1=lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java&r1=1304975&r2=1304978&rev=1304978&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java Sun Mar 25 01:58:14 2012
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.core;
+package org.apache.lucene.analysis;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,8 +17,6 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
 import org.apache.lucene.util.Version;
 
@@ -259,7 +257,7 @@ public class TestUAX29URLEmailAnalyzer e
         new String[] { "<URL>", "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>", "<ALPHANUM>" });
   }
 
-  
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random, new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);