You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/12/07 20:47:09 UTC

svn commit: r1043180 [1/3] - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/ lucene/src/java/org/apache/lucene/analysis/standard/ lucene/src/test/org/apache/lucene/analysis/ solr/ so...

Author: sarowe
Date: Tue Dec  7 19:47:08 2010
New Revision: 1043180

URL: http://svn.apache.org/viewvc?rev=1043180&view=rev
Log:
LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer

Added:
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
      - copied, changed from r1043071, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
      - copied unchanged from r1043071, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
      - copied, changed from r1043071, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
    lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
      - copied unchanged from r1043071, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
    lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
      - copied unchanged from r1043071, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
Removed:
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java
    lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java
    lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/build.xml
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/package.html
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/CHANGES.txt

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Tue Dec  7 19:47:08 2010
@@ -210,14 +210,16 @@ API Changes
   opens internally when apply deletions or creating a near-real-time
   reader.  (Earwin Burrfoot via Mike McCandless)
 
-* LUCENE-2167: StandardTokenizer/Analyzer now implement the Word Break
-  rules from the Unicode Text Segmentation algorithm (UAX#29), as well
-  as tokenizing URLs and email addresses according to the relevant
-  RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer /
-  StandardAnalyzer behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
+* LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer now 
+  implement the Word Break rules from the Unicode 6.0.0 Text Segmentation 
+  algorithm (UAX#29).  
+   
+  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
+  implementation and behavior.
 
-* LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
-  (Steven Rowe)
+  UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
+  relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
+  (Steven Rowe, Robert Muir, Uwe Schindler)
    
 * LUCENE-2778: RAMDirectory now exposes newRAMFile() which allows to override
   and return a different RAMFile implementation. (Shai Erera)

Modified: lucene/dev/branches/branch_3x/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/build.xml?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/build.xml (original)
+++ lucene/dev/branches/branch_3x/lucene/build.xml Tue Dec  7 19:47:08 2010
@@ -517,7 +517,7 @@
   <!-- Build the JFlex files into the source tree                         -->
   <!-- ================================================================== -->
 
-  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer" />
+  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer" />
 
   <target name="jflex-StandardAnalyzer" depends="init,jflex-check,gen-tlds" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@@ -532,11 +532,11 @@
            nobak="on" />
   </target>
 
-  <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+  <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
 			<classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
            outdir="src/java/org/apache/lucene/analysis/standard"
            nobak="on" />
   </target>

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Tue Dec  7 19:47:08 2010
@@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends Ba
       assertAnalyzesToReuse(
           analyzer,
           "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
-          new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
+          new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
 	}
 	
 	/** @deprecated, for version back compat */

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro Tue Dec  7 19:47:08 2010
@@ -15,8 +15,8 @@
  */
 
 // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// file version from Tuesday, December 7, 2010 12:34:02 PM UTC
+// generated on Tuesday, December 7, 2010 4:53:37 PM UTC
 // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
 
 ASCIITLD = "." (

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Tue Dec  7 19:47:08 2010
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 10:48 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/7/10 11:53 AM */
 
 package org.apache.lucene.analysis.standard;
 
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokena
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 10:48 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene_3xclean/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 12/7/10 11:53 AM from the specification file
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
  */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {
 
@@ -635,6 +635,12 @@ public final void getText(CharTermAttrib
   
       zzState = ZZ_LEXSTATE[zzLexicalState];
 
+      // set up zzAction for empty match case:
+      int zzAttributes = zzAttrL[zzState];
+      if ( (zzAttributes & 1) == 1 ) {
+        zzAction = zzState;
+      }
+
 
       zzForAction: {
         while (true) {
@@ -667,7 +673,7 @@ public final void getText(CharTermAttrib
           if (zzNext == -1) break zzForAction;
           zzState = zzNext;
 
-          int zzAttributes = zzAttrL[zzState];
+          zzAttributes = zzAttrL[zzState];
           if ( (zzAttributes & 1) == 1 ) {
             zzAction = zzState;
             zzMarkedPosL = zzCurrentPosL;
@@ -681,44 +687,44 @@ public final void getText(CharTermAttrib
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 10: 
-          { return EMAIL;
+        case 5: 
+          { return NUM;
           }
         case 11: break;
-        case 2: 
-          { return ALPHANUM;
+        case 9: 
+          { return ACRONYM;
           }
         case 12: break;
-        case 4: 
-          { return HOST;
+        case 7: 
+          { return COMPANY;
           }
         case 13: break;
-        case 1: 
-          { /* ignore */
+        case 10: 
+          { return EMAIL;
           }
         case 14: break;
-        case 8: 
-          { return ACRONYM_DEP;
+        case 1: 
+          { /* ignore */
           }
         case 15: break;
-        case 5: 
-          { return NUM;
+        case 6: 
+          { return APOSTROPHE;
           }
         case 16: break;
-        case 9: 
-          { return ACRONYM;
+        case 3: 
+          { return CJ;
           }
         case 17: break;
-        case 7: 
-          { return COMPANY;
+        case 8: 
+          { return ACRONYM_DEP;
           }
         case 18: break;
-        case 6: 
-          { return APOSTROPHE;
+        case 2: 
+          { return ALPHANUM;
           }
         case 19: break;
-        case 3: 
-          { return CJ;
+        case 4: 
+          { return HOST;
           }
         case 20: break;
         default: 

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt Tue Dec  7 19:47:08 2010
@@ -16,6 +16,6 @@
 */
 
 
-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
-      the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
-      SVN revision 597) at the moment!
+WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer.jflex
+      and need to regenerate the tokenizer, only use the trunk version
+      of JFlex 1.5 (with a minimum SVN revision 597) at the moment!

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1043180&r1=1043179&r2=1043180&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Tue Dec  7 19:47:08 2010
@@ -86,10 +86,9 @@ public final class StandardTokenizer ext
   @Deprecated
   public static final int ACRONYM_DEP       = 8;
 
-  public static final int URL = 9;
-  public static final int SOUTHEAST_ASIAN = 10;
-  public static final int IDEOGRAPHIC = 11;
-  public static final int HIRAGANA = 12;
+  public static final int SOUTHEAST_ASIAN = 9;
+  public static final int IDEOGRAPHIC = 10;
+  public static final int HIRAGANA = 11;
   
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -102,7 +101,6 @@ public final class StandardTokenizer ext
     "<NUM>",
     "<CJ>",
     "<ACRONYM_DEP>",
-    "<URL>",
     "<SOUTHEAST_ASIAN>",
     "<IDEOGRAPHIC>",
     "<HIRAGANA>"