You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/12/07 15:53:14 UTC

svn commit: r1043071 [1/5] - in /lucene/dev/trunk: modules/analysis/ modules/analysis/common/ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ modules/analysis/common/src/test/org/apache/lucene/analysis/core/ modules/analysis/commo...

Author: sarowe
Date: Tue Dec  7 14:53:13 2010
New Revision: 1043071

URL: http://svn.apache.org/viewvc?rev=1043071&view=rev
Log:
LUCENE-2763: Swap the URL+Email-recognizing StandardTokenizer and UAX29Tokenizer (URL and e-mail recognition now lives in UAX29URLEmailTokenizer)

Added:
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java   (contents, props changed)
      - copied, changed from r1042243, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex   (contents, props changed)
      - copied, changed from r1042243, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java   (contents, props changed)
      - copied, changed from r1042261, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java   (contents, props changed)
      - copied, changed from r1042243, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java   (contents, props changed)
      - copied, changed from r1042243, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
Removed:
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
Modified:
    lucene/dev/trunk/modules/analysis/CHANGES.txt
    lucene/dev/trunk/modules/analysis/common/build.xml
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
    lucene/dev/trunk/solr/CHANGES.txt

Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Tue Dec  7 14:53:13 2010
@@ -9,15 +9,17 @@ API Changes
 
  * LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous.  (Robert Muir)
 
- * LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
-   the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
-   as well as tokenizing URLs and email addresses according to the relevant
-   RFCs.  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
-   behavior.  (Steven Rowe, Robert Muir, Uwe Schindler)
-
- * LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
-   (Steven Rowe)
+ * LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in 
+   common/standard/ now implement the Word Break rules from the Unicode 6.0.0
+   Text Segmentation algorithm (UAX#29).  
    
+   ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
+   implementation and behavior.
+
+   UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
+   relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
+   (Steven Rowe, Robert Muir, Uwe Schindler)
+
  * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
    can be generated. (Chris Harris via Steven Rowe)
    

Modified: lucene/dev/trunk/modules/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/build.xml?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/build.xml (original)
+++ lucene/dev/trunk/modules/analysis/common/build.xml Tue Dec  7 14:53:13 2010
@@ -38,7 +38,7 @@
 
   <target name="compile-core" depends="jflex-notice, common.compile-core"/>
 
-  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
+  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
 
   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@@ -62,11 +62,11 @@
            nobak="on" />
   </target>
 
-  <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+  <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
 			<classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
            outdir="src/java/org/apache/lucene/analysis/standard"
            nobak="on" />
   </target>

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro Tue Dec  7 14:53:13 2010
@@ -15,8 +15,8 @@
  */
 
 // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// file version from Saturday, December 4, 2010 12:34:19 PM UTC
+// generated on Sunday, December 5, 2010 12:24:12 AM UTC
 // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
 
 ASCIITLD = "." (

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Tue Dec  7 14:53:13 2010
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */
 
 package org.apache.lucene.analysis.standard;
 
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokena
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 12/4/10 7:24 PM from the specification file
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
  */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {
 
@@ -630,6 +630,12 @@ public final void getText(CharTermAttrib
   
       zzState = ZZ_LEXSTATE[zzLexicalState];
 
+      // set up zzAction for empty match case:
+      int zzAttributes = zzAttrL[zzState];
+      if ( (zzAttributes & 1) == 1 ) {
+        zzAction = zzState;
+      }
+
 
       zzForAction: {
         while (true) {
@@ -662,7 +668,7 @@ public final void getText(CharTermAttrib
           if (zzNext == -1) break zzForAction;
           zzState = zzNext;
 
-          int zzAttributes = zzAttrL[zzState];
+          zzAttributes = zzAttrL[zzState];
           if ( (zzAttributes & 1) == 1 ) {
             zzAction = zzState;
             zzMarkedPosL = zzCurrentPosL;
@@ -676,44 +682,44 @@ public final void getText(CharTermAttrib
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 10: 
-          { return EMAIL;
+        case 5: 
+          { return NUM;
           }
         case 11: break;
-        case 2: 
-          { return ALPHANUM;
+        case 9: 
+          { return ACRONYM;
           }
         case 12: break;
-        case 4: 
-          { return HOST;
+        case 7: 
+          { return COMPANY;
           }
         case 13: break;
-        case 1: 
-          { /* ignore */
+        case 10: 
+          { return EMAIL;
           }
         case 14: break;
-        case 8: 
-          { return ACRONYM_DEP;
+        case 1: 
+          { /* ignore */
           }
         case 15: break;
-        case 5: 
-          { return NUM;
+        case 6: 
+          { return APOSTROPHE;
           }
         case 16: break;
-        case 9: 
-          { return ACRONYM;
+        case 3: 
+          { return CJ;
           }
         case 17: break;
-        case 7: 
-          { return COMPANY;
+        case 8: 
+          { return ACRONYM_DEP;
           }
         case 18: break;
-        case 6: 
-          { return APOSTROPHE;
+        case 2: 
+          { return ALPHANUM;
           }
         case 19: break;
-        case 3: 
-          { return CJ;
+        case 4: 
+          { return HOST;
           }
         case 20: break;
         default: 

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt Tue Dec  7 14:53:13 2010
@@ -16,6 +16,6 @@
 */
 
 
-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
-      the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
-      SVN revision 597) at the moment!
+WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer.jflex
+      and need to regenerate the tokenizer, only use the trunk version
+      of JFlex 1.5 (with a minimum SVN revision 597) at the moment!

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Tue Dec  7 14:53:13 2010
@@ -83,10 +83,9 @@ public final class StandardTokenizer ext
   @Deprecated
   public static final int ACRONYM_DEP       = 8;
 
-  public static final int URL = 9;
-  public static final int SOUTHEAST_ASIAN = 10;
-  public static final int IDEOGRAPHIC = 11;
-  public static final int HIRAGANA = 12;
+  public static final int SOUTHEAST_ASIAN = 9;
+  public static final int IDEOGRAPHIC = 10;
+  public static final int HIRAGANA = 11;
   
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -99,7 +98,6 @@ public final class StandardTokenizer ext
     "<NUM>",
     "<CJ>",
     "<ACRONYM_DEP>",
-    "<URL>",
     "<SOUTHEAST_ASIAN>",
     "<IDEOGRAPHIC>",
     "<HIRAGANA>"