You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by eh...@apache.org on 2003/09/30 18:31:49 UTC
cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/standard StandardTokenizer.jj
ehatcher 2003/09/30 09:31:49
Modified: src/java/org/apache/lucene/analysis/standard
StandardTokenizer.jj
Log:
#23466 - StandardTokenizer with CJK support (sigram)
Revision Changes Path
1.4 +8 -2 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
Index: StandardTokenizer.jj
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- StandardTokenizer.jj 5 Jun 2002 04:54:47 -0000 1.3
+++ StandardTokenizer.jj 30 Sep 2003 16:31:49 -0000 1.4
@@ -56,7 +56,7 @@
STATIC = false;
//IGNORE_CASE = true;
//BUILD_PARSER = false;
-//UNICODE_INPUT = true;
+ UNICODE_INPUT = true;
USER_CHAR_STREAM = true;
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_TOKEN_MANAGER = true;
@@ -125,6 +125,7 @@
(<LETTER>|<DIGIT>)*
>
+| < SIGRAM: (<CJK>)+ >
| < #ALPHA: (<LETTER>)+>
| < #LETTER: // unicode letters
[
@@ -133,7 +134,11 @@
"\u00c0"-"\u00d6",
"\u00d8"-"\u00f6",
"\u00f8"-"\u00ff",
- "\u0100"-"\u1fff",
+ "\u0100"-"\u1fff"
+ ]
+ >
+| < #CJK: // non-alphabets
+ [
"\u3040"-"\u318f",
"\u3300"-"\u337f",
"\u3400"-"\u3d2d",
@@ -182,6 +187,7 @@
token = <EMAIL> |
token = <HOST> |
token = <NUM> |
+ token = <SIGRAM> |
token = <EOF>
)
{