You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/12/07 20:47:09 UTC
svn commit: r1043180 [1/3] - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/
lucene/src/java/org/apache/lucene/analysis/standard/
lucene/src/test/org/apache/lucene/analysis/ solr/ so...
Author: sarowe
Date: Tue Dec 7 19:47:08 2010
New Revision: 1043180
URL: http://svn.apache.org/viewvc?rev=1043180&view=rev
Log:
LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer
Added:
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
- copied, changed from r1043071, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
- copied unchanged from r1043071, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
- copied, changed from r1043071, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
- copied unchanged from r1043071, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
- copied unchanged from r1043071, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
Removed:
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java
lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java
lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/build.xml
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/package.html
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/CHANGES.txt
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Tue Dec 7 19:47:08 2010
@@ -210,14 +210,16 @@ API Changes
opens internally when apply deletions or creating a near-real-time
reader. (Earwin Burrfoot via Mike McCandless)
-* LUCENE-2167: StandardTokenizer/Analyzer now implement the Word Break
- rules from the Unicode Text Segmentation algorithm (UAX#29), as well
- as tokenizing URLs and email addresses according to the relevant
- RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer /
- StandardAnalyzer behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
+* LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer now
+ implement the Word Break rules from the Unicode 6.0.0 Text Segmentation
+ algorithm (UAX#29).
+
+ ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
+ implementation and behavior.
-* LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
- (Steven Rowe)
+ UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
+ relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
+ (Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-2778: RAMDirectory now exposes newRAMFile() which allows to override
and return a different RAMFile implementation. (Shai Erera)
Modified: lucene/dev/branches/branch_3x/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/build.xml?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/build.xml (original)
+++ lucene/dev/branches/branch_3x/lucene/build.xml Tue Dec 7 19:47:08 2010
@@ -517,7 +517,7 @@
<!-- Build the JFlex files into the source tree -->
<!-- ================================================================== -->
- <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer" />
+ <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer" />
<target name="jflex-StandardAnalyzer" depends="init,jflex-check,gen-tlds" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@@ -532,11 +532,11 @@
nobak="on" />
</target>
- <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+ <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
- <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+ <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Tue Dec 7 19:47:08 2010
@@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends Ba
assertAnalyzesToReuse(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
- new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
+ new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}
/** @deprecated, for version back compat */
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro Tue Dec 7 19:47:08 2010
@@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// file version from Tuesday, December 7, 2010 12:34:02 PM UTC
+// generated on Tuesday, December 7, 2010 4:53:37 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Tue Dec 7 19:47:08 2010
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 10:48 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/7/10 11:53 AM */
package org.apache.lucene.analysis.standard;
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokena
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 10:48 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene_3xclean/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 12/7/10 11:53 AM from the specification file
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@@ -635,6 +635,12 @@ public final void getText(CharTermAttrib
zzState = ZZ_LEXSTATE[zzLexicalState];
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ }
+
zzForAction: {
while (true) {
@@ -667,7 +673,7 @@ public final void getText(CharTermAttrib
if (zzNext == -1) break zzForAction;
zzState = zzNext;
- int zzAttributes = zzAttrL[zzState];
+ zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
@@ -681,44 +687,44 @@ public final void getText(CharTermAttrib
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 10:
- { return EMAIL;
+ case 5:
+ { return NUM;
}
case 11: break;
- case 2:
- { return ALPHANUM;
+ case 9:
+ { return ACRONYM;
}
case 12: break;
- case 4:
- { return HOST;
+ case 7:
+ { return COMPANY;
}
case 13: break;
- case 1:
- { /* ignore */
+ case 10:
+ { return EMAIL;
}
case 14: break;
- case 8:
- { return ACRONYM_DEP;
+ case 1:
+ { /* ignore */
}
case 15: break;
- case 5:
- { return NUM;
+ case 6:
+ { return APOSTROPHE;
}
case 16: break;
- case 9:
- { return ACRONYM;
+ case 3:
+ { return CJ;
}
case 17: break;
- case 7:
- { return COMPANY;
+ case 8:
+ { return ACRONYM_DEP;
}
case 18: break;
- case 6:
- { return APOSTROPHE;
+ case 2:
+ { return ALPHANUM;
}
case 19: break;
- case 3:
- { return CJ;
+ case 4:
+ { return HOST;
}
case 20: break;
default:
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt Tue Dec 7 19:47:08 2010
@@ -16,6 +16,6 @@
*/
-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
- the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
- SVN revision 597) at the moment!
+WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
+ and need to regenerate the tokenizer, only use the trunk version
+ of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Tue Dec 7 19:47:08 2010
@@ -86,10 +86,9 @@ public final class StandardTokenizer ext
@Deprecated
public static final int ACRONYM_DEP = 8;
- public static final int URL = 9;
- public static final int SOUTHEAST_ASIAN = 10;
- public static final int IDEOGRAPHIC = 11;
- public static final int HIRAGANA = 12;
+ public static final int SOUTHEAST_ASIAN = 9;
+ public static final int IDEOGRAPHIC = 10;
+ public static final int HIRAGANA = 11;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@@ -102,7 +101,6 @@ public final class StandardTokenizer ext
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>",
- "<URL>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>"