You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/03/25 03:58:15 UTC
svn commit: r1304978 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/core/src/ lucene/core/src/java/org/apache/lucene/analysis/standard/
lucene/core/src/test/org/apache/lucene/analysis/ solr/
Author: sarowe
Date: Sun Mar 25 01:58:14 2012
New Revision: 1304978
URL: http://svn.apache.org/viewvc?rev=1304978&view=rev
Log:
LUCENE-3881: Added UAX29URLEmailAnalyzer
Added:
lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
- copied, changed from r1304975, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java
- copied, changed from r1304975, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/core/src/ (props changed)
lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html
lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1304978&r1=1304977&r2=1304978&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Sun Mar 25 01:58:14 2012
@@ -197,6 +197,9 @@ New Features
* LUCENE-3789: Expose MTQ TermsEnum via RewriteMethod for non package private
access (Simon Willnauer)
+* LUCENE-3881: Added UAX29URLEmailAnalyzer: a standard analyzer that recognizes
+ URLs and emails. (Steve Rowe)
+
Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter
Copied: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java (from r1304975, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java?p2=lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java&p1=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java&r1=1304975&r2=1304978&rev=1304978&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java Sun Mar 25 01:58:14 2012
@@ -17,22 +17,18 @@ package org.apache.lucene.analysis.stand
* limitations under the License.
*/
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
+import java.util.Set;
/**
* Filters {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer}
* with {@link org.apache.lucene.analysis.standard.StandardFilter},
- * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and
- * {@link org.apache.lucene.analysis.core.StopFilter}, using a list of
+ * {@link org.apache.lucene.analysis.LowerCaseFilter} and
+ * {@link org.apache.lucene.analysis.StopFilter}, using a list of
* English stop words.
*
* <a name="version"/>
@@ -50,13 +46,13 @@ public final class UAX29URLEmailAnalyzer
/** An unmodifiable set containing some common English words that are usually not
useful for searching. */
- public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+ public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopWords stop words */
- public UAX29URLEmailAnalyzer(Version matchVersion, CharArraySet stopWords) {
+ public UAX29URLEmailAnalyzer(Version matchVersion, Set<?> stopWords) {
super(matchVersion, stopWords);
}
@@ -70,19 +66,19 @@ public final class UAX29URLEmailAnalyzer
}
/** Builds an analyzer with the stop words from the given reader.
- * @see org.apache.lucene.analysis.util.WordlistLoader#getWordSet(java.io.Reader, org.apache.lucene.util.Version)
+ * @see org.apache.lucene.analysis.WordlistLoader#getWordSet(java.io.Reader, org.apache.lucene.util.Version)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public UAX29URLEmailAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, loadStopwordSet(stopwords, matchVersion));
+ this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
}
/**
* Set maximum allowed token length. If a token is seen
* that exceeds this length then it is discarded. This
* setting only takes effect the next time tokenStream or
- * tokenStream is called.
+ * reusableTokenStream is called.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
@@ -104,9 +100,9 @@ public final class UAX29URLEmailAnalyzer
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@Override
- protected void reset(final Reader reader) throws IOException {
+ protected boolean reset(final Reader reader) throws IOException {
src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
- super.reset(reader);
+ return super.reset(reader);
}
};
}
Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html?rev=1304978&r1=1304977&r2=1304978&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/standard/package.html Sun Mar 25 01:58:14 2012
@@ -55,6 +55,12 @@
algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
URLs and email addresses are also tokenized according to the relevant RFCs.
+ <br/>
+ <code><a href="UAX29URLEmailAnalyzer.html">UAX29URLEmailAnalyzer</a></code> includes
+ <code>UAX29URLEmailTokenizer</code>,
+ <code><a href="StandardFilter.html">StandardFilter</a></code>,
+ <code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
+ and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
</li>
</ul>
</body>
Copied: lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java (from r1304975, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java?p2=lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java&p1=lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java&r1=1304975&r2=1304978&rev=1304978&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/analysis/TestUAX29URLEmailAnalyzer.java Sun Mar 25 01:58:14 2012
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.core;
+package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,8 +17,6 @@ package org.apache.lucene.analysis.core;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
import org.apache.lucene.util.Version;
@@ -259,7 +257,7 @@ public class TestUAX29URLEmailAnalyzer e
new String[] { "<URL>", "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>", "<ALPHANUM>" });
}
-
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);