You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/15 07:41:55 UTC
svn commit: r1022826 [1/3] - in /lucene/dev/trunk/modules/analysis: ./
common/src/java/org/apache/lucene/analysis/standard/
common/src/test/org/apache/lucene/analysis/core/
Author: sarowe
Date: Fri Oct 15 05:41:54 2010
New Revision: 1022826
URL: http://svn.apache.org/viewvc?rev=1022826&view=rev
Log:
LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0
Added:
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/WordBreakTestUnicode_6_0_0.java (with props)
Removed:
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/WordBreakTestUnicode_5_2_0.java
Modified:
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/generateJavaUnicodeWordBreakTest.pl
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Fri Oct 15 05:41:54 2010
@@ -15,6 +15,9 @@ API Changes
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
+ * LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
+ (Steven Rowe)
+
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro Fri Oct 15 05:41:54 2010
@@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Sunday, October 3, 2010 11:34:02 AM UTC
-// generated on Sunday, October 3, 2010 1:07:42 PM UTC
+// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
+// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt Fri Oct 15 05:41:54 2010
@@ -18,4 +18,4 @@
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
- SVN revision 591) at the moment!
+ SVN revision 597) at the moment!
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Fri Oct 15 05:41:54 2010
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
package org.apache.lucene.analysis.standard;
@@ -87,107 +87,109 @@ public final class StandardTokenizerImpl
"\37\1\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0"+
"\1\1\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0"+
"\1\1\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0"+
- "\213\1\1\0\7\2\234\1\13\0\46\1\2\0\1\1\7\0\47\1"+
+ "\213\1\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1"+
"\1\0\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2"+
"\1\0\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0"+
- "\2\6\2\0\13\2\6\0\52\1\24\2\1\0\12\3\1\0\1\3"+
- "\1\6\1\0\2\1\1\2\143\1\1\0\1\1\17\2\2\1\2\2"+
- "\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
- "\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
- "\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
- "\11\2\1\1\3\2\1\1\5\2\322\0\4\2\66\1\2\0\1\2"+
- "\1\1\21\2\1\0\1\1\5\2\2\0\12\1\2\2\2\0\12\3"+
- "\1\0\2\1\6\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
- "\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
- "\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
- "\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
- "\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
- "\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
- "\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
- "\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
- "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
- "\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
- "\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
- "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
- "\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
- "\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
- "\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
- "\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
- "\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
- "\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
- "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
- "\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
- "\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
- "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
- "\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1"+
- "\1\0\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2\1\0"+
- "\4\2\11\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0\6\1"+
- "\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1"+
- "\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2"+
- "\22\0\2\2\15\0\60\105\1\106\2\105\7\106\5\0\7\105\10\106"+
- "\1\0\12\3\47\0\2\105\1\0\1\105\2\0\2\105\1\0\1\105"+
- "\2\0\1\105\6\0\4\105\1\0\7\105\1\0\3\105\1\0\1\105"+
- "\1\0\1\105\2\0\2\105\1\0\4\105\1\106\2\105\6\106\1\0"+
- "\2\106\1\105\2\0\5\105\1\0\1\105\1\0\6\106\2\0\12\3"+
- "\2\0\2\105\42\0\1\1\27\0\2\2\6\0\12\3\13\0\1\2"+
- "\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1\4\0"+
- "\24\2\1\0\2\2\4\1\4\0\10\2\1\0\44\2\11\0\1\2"+
- "\71\0\53\105\24\106\1\105\12\3\6\0\6\105\4\106\4\105\3\106"+
- "\1\105\3\106\2\105\7\106\3\105\4\106\15\105\14\106\1\105\1\106"+
- "\12\3\4\106\2\105\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
- "\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
- "\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
- "\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
- "\4\0\1\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
- "\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
- "\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
- "\3\1\1\0\2\2\14\0\64\105\40\106\3\0\1\105\4\0\1\105"+
- "\1\106\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
- "\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
- "\14\2\12\0\12\3\36\105\2\0\5\105\13\0\54\105\4\0\21\106"+
- "\7\105\2\106\6\0\13\3\3\0\2\105\40\0\27\1\5\2\4\0"+
- "\65\105\12\106\1\0\35\106\2\0\1\2\12\3\6\0\12\3\6\0"+
- "\16\105\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0\11\2"+
- "\14\0\3\2\36\1\12\2\3\0\2\1\12\3\106\0\44\1\24\2"+
- "\10\0\12\3\3\0\3\1\12\3\44\1\122\0\3\2\1\0\25\2"+
- "\4\1\1\2\4\1\1\2\15\0\300\1\47\2\26\0\3\2\u0116\1"+
- "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
- "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
- "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
- "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\2\10\0\2\7"+
- "\12\0\1\7\2\0\1\5\2\0\5\2\20\0\2\10\3\0\1\6"+
- "\17\0\1\10\13\0\5\2\5\0\6\2\1\0\1\1\15\0\1\1"+
- "\20\0\5\1\73\0\41\2\21\0\1\1\4\0\1\1\2\0\12\1"+
- "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
- "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
- "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
- "\6\0\4\1\3\2\16\0\46\1\12\0\66\1\11\0\1\1\20\0"+
- "\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
- "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\57\0"+
- "\1\1\120\0\32\107\1\0\131\107\14\0\326\107\57\0\1\1\1\0"+
- "\1\107\31\0\11\107\6\2\1\0\5\4\2\0\3\107\1\1\1\1"+
- "\4\0\126\110\2\0\2\2\2\4\3\110\133\4\1\0\4\4\5\0"+
- "\51\1\3\0\136\1\21\0\30\1\70\0\20\4\320\0\57\4\1\0"+
- "\130\4\250\0\u19b6\107\112\0\u51cc\107\64\0\u048d\1\103\0\56\1\2\0"+
- "\u010d\1\3\0\20\1\12\3\2\1\24\0\40\1\2\0\15\1\4\2"+
- "\11\0\2\2\1\0\31\1\10\0\120\1\2\2\45\0\11\1\2\0"+
- "\147\1\2\0\2\1\156\0\7\1\1\2\3\1\1\2\4\1\1\2"+
- "\27\1\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\3"+
- "\6\0\22\2\6\1\3\0\1\1\4\0\12\3\34\1\10\2\2\0"+
- "\27\1\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1"+
- "\12\3\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0"+
- "\12\3\6\0\33\105\1\106\4\0\60\105\1\106\1\105\3\106\2\105"+
- "\2\106\5\105\2\106\1\105\1\106\1\105\30\0\5\105\340\0\43\1"+
- "\10\2\1\0\2\2\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0"+
- "\61\1\u2104\0\u012e\107\2\0\76\107\2\0\152\107\46\0\7\1\14\0"+
- "\5\1\5\0\1\1\1\2\12\1\1\0\15\1\1\0\5\1\1\0"+
- "\1\1\1\0\2\1\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0"+
- "\100\1\2\0\66\1\50\0\14\1\4\0\20\2\1\6\2\0\1\5"+
- "\1\6\13\0\7\2\14\0\2\10\30\0\3\10\1\6\1\0\1\7"+
- "\1\0\1\6\1\5\32\0\5\1\1\0\207\1\2\0\1\2\7\0"+
- "\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6\5\0"+
- "\32\1\4\0\1\10\1\0\32\1\13\0\70\4\2\2\37\1\3\0"+
- "\6\1\2\0\6\1\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
+ "\2\6\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6"+
+ "\1\0\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1"+
+ "\2\2\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2"+
+ "\1\1\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3"+
+ "\41\1\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2"+
+ "\1\1\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0"+
+ "\4\2\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0"+
+ "\12\3\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0"+
+ "\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0"+
+ "\1\2\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2"+
+ "\4\0\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2"+
+ "\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1"+
+ "\1\0\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2"+
+ "\2\0\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3"+
+ "\2\2\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0"+
+ "\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1"+
+ "\10\2\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2"+
+ "\2\0\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1"+
+ "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2"+
+ "\2\0\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1"+
+ "\2\2\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1"+
+ "\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1"+
+ "\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2"+
+ "\1\0\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2"+
+ "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
+ "\3\0\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0"+
+ "\2\1\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1"+
+ "\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2"+
+ "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1"+
+ "\1\0\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0"+
+ "\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2"+
+ "\1\0\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3"+
+ "\12\0\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1"+
+ "\1\0\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2"+
+ "\1\0\10\2\22\0\2\2\15\0\60\105\1\106\2\105\7\106\5\0"+
+ "\7\105\10\106\1\0\12\3\47\0\2\105\1\0\1\105\2\0\2\105"+
+ "\1\0\1\105\2\0\1\105\6\0\4\105\1\0\7\105\1\0\3\105"+
+ "\1\0\1\105\1\0\1\105\2\0\2\105\1\0\4\105\1\106\2\105"+
+ "\6\106\1\0\2\106\1\105\2\0\5\105\1\0\1\105\1\0\6\106"+
+ "\2\0\12\3\2\0\2\105\42\0\1\1\27\0\2\2\6\0\12\3"+
+ "\13\0\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0"+
+ "\44\1\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0"+
+ "\1\2\71\0\53\105\24\106\1\105\12\3\6\0\6\105\4\106\4\105"+
+ "\3\106\1\105\3\106\2\105\7\106\3\105\4\106\15\105\14\106\1\105"+
+ "\1\106\12\3\4\106\2\105\46\1\12\0\53\1\1\0\1\1\3\0"+
+ "\u0149\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
+ "\51\1\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0"+
+ "\1\1\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0"+
+ "\103\1\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0"+
+ "\21\1\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0"+
+ "\4\1\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1"+
+ "\1\0\3\1\1\0\2\2\14\0\64\105\40\106\3\0\1\105\4\0"+
+ "\1\105\1\106\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1"+
+ "\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
+ "\4\0\14\2\12\0\12\3\36\105\2\0\5\105\13\0\54\105\4\0"+
+ "\21\106\7\105\2\106\6\0\12\3\1\105\3\0\2\105\40\0\27\1"+
+ "\5\2\4\0\65\105\12\106\1\0\35\106\2\0\1\2\12\3\6\0"+
+ "\12\3\6\0\16\105\122\0\5\2\57\1\21\2\7\1\4\0\12\3"+
+ "\21\0\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0"+
+ "\46\1\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3"+
+ "\44\1\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0"+
+ "\300\1\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0"+
+ "\6\1\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0"+
+ "\37\1\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0"+
+ "\7\1\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0"+
+ "\7\1\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0"+
+ "\5\2\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0"+
+ "\6\2\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0"+
+ "\1\1\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0"+
+ "\1\1\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0"+
+ "\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0"+
+ "\57\1\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1"+
+ "\12\0\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0"+
+ "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
+ "\7\1\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\107\1\0"+
+ "\131\107\14\0\326\107\57\0\1\1\1\0\1\107\31\0\11\107\6\2"+
+ "\1\0\5\4\2\0\3\107\1\1\1\1\4\0\126\110\2\0\2\2"+
+ "\2\4\3\110\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0"+
+ "\33\1\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\107\112\0"+
+ "\u51cc\107\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3"+
+ "\2\1\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1"+
+ "\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0"+
+ "\12\1\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2"+
+ "\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2"+
+ "\6\1\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2"+
+ "\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0"+
+ "\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0"+
+ "\33\105\1\106\4\0\60\105\1\106\1\105\3\106\2\105\2\106\5\105"+
+ "\2\106\1\105\1\106\1\105\30\0\5\105\41\0\6\1\2\0\6\1"+
+ "\2\0\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0"+
+ "\2\2\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0"+
+ "\u012e\107\2\0\76\107\2\0\152\107\46\0\7\1\14\0\5\1\5\0"+
+ "\1\1\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0"+
+ "\2\1\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0"+
+ "\66\1\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0"+
+ "\7\2\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6"+
+ "\1\5\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0"+
+ "\1\6\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0"+
+ "\1\10\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0"+
+ "\6\1\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
/**
* Translates characters to character classes
@@ -2440,7 +2442,7 @@ public final class StandardTokenizerImpl
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
- while (i < 2300) {
+ while (i < 2336) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@@ -2713,36 +2715,36 @@ public final class StandardTokenizerImpl
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 2:
- { return WORD_TYPE;
+ case 1:
+ { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 9: break;
- case 4:
- { return SOUTH_EAST_ASIAN_TYPE;
+ case 6:
+ { return HIRAGANA_TYPE;
}
case 10: break;
- case 8:
- { return URL_TYPE;
+ case 2:
+ { return WORD_TYPE;
}
case 11: break;
- case 7:
- { return EMAIL_TYPE;
+ case 8:
+ { return URL_TYPE;
}
case 12: break;
case 5:
{ return IDEOGRAPHIC_TYPE;
}
case 13: break;
- case 1:
- { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+ case 7:
+ { return EMAIL_TYPE;
}
case 14: break;
case 3:
{ return NUMERIC_TYPE;
}
case 15: break;
- case 6:
- { return HIRAGANA_TYPE;
+ case 4:
+ { return SOUTH_EAST_ASIAN_TYPE;
}
case 16: break;
default:
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Fri Oct 15 05:41:54 2010
@@ -47,7 +47,7 @@ import org.apache.lucene.analysis.tokena
*/
%%
-%unicode 5.2
+%unicode 6.0
%integer
%final
%public
@@ -234,7 +234,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
-// In Unicode 5.2, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ￼ ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java Fri Oct 15 05:41:54 2010
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
package org.apache.lucene.analysis.standard;
@@ -85,107 +85,109 @@ public final class UAX29Tokenizer extend
"\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
"\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
"\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
- "\1\0\7\2\234\1\13\0\46\1\2\0\1\1\7\0\47\1\1\0"+
+ "\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
"\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
"\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
- "\2\0\13\2\6\0\52\1\24\2\1\0\12\3\1\0\1\3\1\6"+
- "\1\0\2\1\1\2\143\1\1\0\1\1\17\2\2\1\2\2\1\0"+
- "\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1\1\2"+
- "\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1\11\2"+
- "\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1\11\2"+
- "\1\1\3\2\1\1\5\2\322\0\4\2\66\1\2\0\1\2\1\1"+
- "\21\2\1\0\1\1\5\2\2\0\12\1\2\2\2\0\12\3\1\0"+
- "\2\1\6\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1\2\0"+
- "\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2\1\1"+
- "\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0\2\1"+
- "\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0\6\1"+
- "\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0\2\1"+
- "\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0\3\2"+
- "\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2\3\1"+
- "\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1\1\0"+
- "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2\1\0"+
- "\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0\12\3"+
- "\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
- "\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2"+
- "\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0"+
- "\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1"+
- "\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1"+
- "\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2"+
- "\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0\10\1"+
- "\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0\1\1"+
- "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1\6\0"+
- "\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1"+
- "\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1\7\2"+
- "\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0\2\1"+
- "\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1\1\0"+
- "\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2\1\0\4\2"+
- "\11\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0\6\1\2\0"+
- "\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0"+
- "\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\22\0"+
- "\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11\10\12\1\0"+
- "\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0\1\11\2\0"+
- "\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0\1\11\1\0"+
- "\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12\1\0\2\12"+
- "\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0\12\3\2\0"+
- "\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0\1\2\1\0"+
- "\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1\4\0\24\2"+
- "\1\0\2\2\4\1\4\0\10\2\1\0\44\2\11\0\1\2\71\0"+
- "\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12\1\11"+
- "\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12\12\3"+
- "\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1\1\0"+
- "\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0"+
- "\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0"+
- "\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1\4\0"+
- "\1\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0"+
- "\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\2"+
- "\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0\3\1"+
- "\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11\1\12"+
- "\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0\51\1"+
- "\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0\14\2"+
- "\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12\7\11"+
- "\2\12\6\0\13\3\3\0\2\11\40\0\27\1\5\2\4\0\65\11"+
- "\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3\6\0\16\11"+
- "\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0\11\2\14\0"+
- "\3\2\36\1\12\2\3\0\2\1\12\3\106\0\44\1\24\2\10\0"+
- "\12\3\3\0\3\1\12\3\44\1\122\0\3\2\1\0\25\2\4\1"+
- "\1\2\4\1\1\2\15\0\300\1\47\2\26\0\3\2\u0116\1\2\0"+
- "\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1\1\0"+
- "\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1\1\0"+
- "\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1\4\0"+
- "\15\1\5\0\3\1\1\0\7\1\17\0\4\2\10\0\2\7\12\0"+
- "\1\7\2\0\1\5\2\0\5\2\20\0\2\10\3\0\1\6\17\0"+
- "\1\10\13\0\5\2\5\0\6\2\1\0\1\1\15\0\1\1\20\0"+
- "\5\1\73\0\41\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0"+
- "\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0"+
- "\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1\21\0"+
- "\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1\6\0"+
- "\4\1\3\2\16\0\46\1\12\0\66\1\11\0\1\1\20\0\27\1"+
- "\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
- "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\57\0\1\1"+
- "\120\0\32\13\1\0\131\13\14\0\326\13\57\0\1\1\1\0\1\13"+
- "\31\0\11\13\6\2\1\0\5\4\2\0\3\13\1\1\1\1\4\0"+
- "\126\14\2\0\2\2\2\4\3\14\133\4\1\0\4\4\5\0\51\1"+
- "\3\0\136\1\21\0\30\1\70\0\20\4\320\0\57\4\1\0\130\4"+
- "\250\0\u19b6\13\112\0\u51cc\13\64\0\u048d\1\103\0\56\1\2\0\u010d\1"+
- "\3\0\20\1\12\3\2\1\24\0\40\1\2\0\15\1\4\2\11\0"+
- "\2\2\1\0\31\1\10\0\120\1\2\2\45\0\11\1\2\0\147\1"+
- "\2\0\2\1\156\0\7\1\1\2\3\1\1\2\4\1\1\2\27\1"+
- "\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0"+
- "\22\2\6\1\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1"+
- "\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3"+
- "\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3"+
- "\6\0\33\11\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12"+
- "\5\11\2\12\1\11\1\12\1\11\30\0\5\11\340\0\43\1\10\2"+
- "\1\0\2\2\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1"+
- "\u2104\0\u012e\13\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1"+
- "\5\0\1\1\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1"+
- "\1\0\2\1\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1"+
- "\2\0\66\1\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6"+
- "\13\0\7\2\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0"+
- "\1\6\1\5\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7"+
- "\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1"+
- "\4\0\1\10\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1"+
- "\2\0\6\1\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
+ "\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
+ "\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
+ "\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
+ "\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
+ "\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
+ "\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
+ "\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
+ "\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
+ "\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
+ "\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
+ "\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
+ "\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
+ "\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
+ "\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
+ "\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
+ "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
+ "\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
+ "\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
+ "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
+ "\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
+ "\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
+ "\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
+ "\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
+ "\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
+ "\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
+ "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
+ "\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
+ "\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
+ "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
+ "\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
+ "\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
+ "\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
+ "\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
+ "\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
+ "\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
+ "\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
+ "\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
+ "\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
+ "\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
+ "\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
+ "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
+ "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
+ "\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
+ "\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
+ "\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
+ "\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
+ "\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
+ "\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
+ "\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
+ "\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
+ "\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
+ "\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
+ "\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
+ "\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
+ "\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
+ "\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
+ "\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
+ "\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
+ "\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
+ "\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
+ "\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
+ "\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
+ "\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
+ "\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
+ "\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
+ "\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
+ "\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
+ "\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
+ "\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
+ "\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
+ "\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
+ "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
+ "\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
+ "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
+ "\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
+ "\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
+ "\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
+ "\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
+ "\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
+ "\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
+ "\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
+ "\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
+ "\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
+ "\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
+ "\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
+ "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
+ "\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
+ "\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
+ "\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
+ "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
+ "\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
+ "\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
+ "\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
+ "\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
+ "\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
+ "\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
+ "\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
+ "\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
+ "\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
+ "\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
/**
* Translates characters to character classes
@@ -530,7 +532,7 @@ public final class UAX29Tokenizer extend
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
- while (i < 2138) {
+ while (i < 2174) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@@ -803,28 +805,28 @@ public final class UAX29Tokenizer extend
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 2:
- { if (populateAttributes(WORD_TYPE)) return true;
+ case 5:
+ { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
}
case 7: break;
- case 6:
- { if (populateAttributes(HIRAGANA_TYPE)) return true;
+ case 1:
+ { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 8: break;
- case 5:
- { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
+ case 3:
+ { if (populateAttributes(NUMERIC_TYPE)) return true;
}
case 9: break;
- case 4:
- { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
+ case 6:
+ { if (populateAttributes(HIRAGANA_TYPE)) return true;
}
case 10: break;
- case 3:
- { if (populateAttributes(NUMERIC_TYPE)) return true;
+ case 4:
+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
}
case 11: break;
- case 1:
- { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+ case 2:
+ { if (populateAttributes(WORD_TYPE)) return true;
}
case 12: break;
default:
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex Fri Oct 15 05:41:54 2010
@@ -53,7 +53,7 @@ import org.apache.lucene.util.AttributeS
*/
%%
-%unicode 5.2
+%unicode 6.0
%final
%public
%apiprivate
@@ -247,7 +247,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
-// In Unicode 5.2, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ￼ ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Fri Oct 15 05:41:54 2010
@@ -394,7 +394,7 @@ public class TestStandardAnalyzer extend
}
public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_5_2_0 wordBreakTest = new WordBreakTestUnicode_5_2_0();
+ WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java?rev=1022826&r1=1022825&r2=1022826&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java Fri Oct 15 05:41:54 2010
@@ -198,7 +198,7 @@ public class TestUAX29Tokenizer extends
}
public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_5_2_0 wordBreakTest = new WordBreakTestUnicode_5_2_0();
+ WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
}