You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/12/07 15:53:14 UTC
svn commit: r1043071 [1/5] - in /lucene/dev/trunk: modules/analysis/
modules/analysis/common/
modules/analysis/common/src/java/org/apache/lucene/analysis/standard/
modules/analysis/common/src/test/org/apache/lucene/analysis/core/
modules/analysis/commo...
Author: sarowe
Date: Tue Dec 7 14:53:13 2010
New Revision: 1043071
URL: http://svn.apache.org/viewvc?rev=1043071&view=rev
Log:
LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (contents, props changed)
- copied, changed from r1042243, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (contents, props changed)
- copied, changed from r1042243, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (contents, props changed)
- copied, changed from r1042261, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java (contents, props changed)
- copied, changed from r1042243, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java (contents, props changed)
- copied, changed from r1042243, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
Removed:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
Modified:
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/build.xml
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
lucene/dev/trunk/solr/CHANGES.txt
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Tue Dec 7 14:53:13 2010
@@ -9,15 +9,17 @@ API Changes
* LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)
- * LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
- the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
- as well as tokenizing URLs and email addresses according to the relevant
- RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
- behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
-
- * LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
- (Steven Rowe)
+ * LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in
+ common/standard/ now implement the Word Break rules from the Unicode 6.0.0
+ Text Segmentation algorithm (UAX#29).
+ ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
+ implementation and behavior.
+
+ UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
+ relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
+ (Steven Rowe, Robert Muir, Uwe Schindler)
+
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
Modified: lucene/dev/trunk/modules/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/build.xml?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/build.xml (original)
+++ lucene/dev/trunk/modules/analysis/common/build.xml Tue Dec 7 14:53:13 2010
@@ -38,7 +38,7 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
- <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
+ <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@@ -62,11 +62,11 @@
nobak="on" />
</target>
- <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+ <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
- <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+ <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro Tue Dec 7 14:53:13 2010
@@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// file version from Saturday, December 4, 2010 12:34:19 PM UTC
+// generated on Sunday, December 5, 2010 12:24:12 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Tue Dec 7 14:53:13 2010
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */
package org.apache.lucene.analysis.standard;
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokena
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 12/4/10 7:24 PM from the specification file
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@@ -630,6 +630,12 @@ public final void getText(CharTermAttrib
zzState = ZZ_LEXSTATE[zzLexicalState];
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ }
+
zzForAction: {
while (true) {
@@ -662,7 +668,7 @@ public final void getText(CharTermAttrib
if (zzNext == -1) break zzForAction;
zzState = zzNext;
- int zzAttributes = zzAttrL[zzState];
+ zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
@@ -676,44 +682,44 @@ public final void getText(CharTermAttrib
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 10:
- { return EMAIL;
+ case 5:
+ { return NUM;
}
case 11: break;
- case 2:
- { return ALPHANUM;
+ case 9:
+ { return ACRONYM;
}
case 12: break;
- case 4:
- { return HOST;
+ case 7:
+ { return COMPANY;
}
case 13: break;
- case 1:
- { /* ignore */
+ case 10:
+ { return EMAIL;
}
case 14: break;
- case 8:
- { return ACRONYM_DEP;
+ case 1:
+ { /* ignore */
}
case 15: break;
- case 5:
- { return NUM;
+ case 6:
+ { return APOSTROPHE;
}
case 16: break;
- case 9:
- { return ACRONYM;
+ case 3:
+ { return CJ;
}
case 17: break;
- case 7:
- { return COMPANY;
+ case 8:
+ { return ACRONYM_DEP;
}
case 18: break;
- case 6:
- { return APOSTROPHE;
+ case 2:
+ { return ALPHANUM;
}
case 19: break;
- case 3:
- { return CJ;
+ case 4:
+ { return HOST;
}
case 20: break;
default:
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt Tue Dec 7 14:53:13 2010
@@ -16,6 +16,6 @@
*/
-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
- the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
- SVN revision 597) at the moment!
+WARNING: if you change StandardTokenizerImpl*.jflex or
+ UAX29URLEmailTokenizer.jflex and need to regenerate the tokenizers, only use
+ the trunk version of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Tue Dec 7 14:53:13 2010
@@ -83,10 +83,9 @@ public final class StandardTokenizer ext
@Deprecated
public static final int ACRONYM_DEP = 8;
- public static final int URL = 9;
- public static final int SOUTHEAST_ASIAN = 10;
- public static final int IDEOGRAPHIC = 11;
- public static final int HIRAGANA = 12;
+ public static final int SOUTHEAST_ASIAN = 9;
+ public static final int IDEOGRAPHIC = 10;
+ public static final int HIRAGANA = 11;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@@ -99,7 +98,6 @@ public final class StandardTokenizer ext
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>",
- "<URL>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>"