You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by eh...@apache.org on 2003/09/30 18:31:49 UTC

cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/standard StandardTokenizer.jj

ehatcher    2003/09/30 09:31:49

  Modified:    src/java/org/apache/lucene/analysis/standard
                        StandardTokenizer.jj
  Log:
  #23466 - StandardTokenizer with CJK support (sigram)
  
  Revision  Changes    Path
  1.4       +8 -2      jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
  
  Index: StandardTokenizer.jj
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- StandardTokenizer.jj	5 Jun 2002 04:54:47 -0000	1.3
  +++ StandardTokenizer.jj	30 Sep 2003 16:31:49 -0000	1.4
  @@ -56,7 +56,7 @@
     STATIC = false;
   //IGNORE_CASE = true;
   //BUILD_PARSER = false;
  -//UNICODE_INPUT = true;
  +  UNICODE_INPUT = true;
     USER_CHAR_STREAM = true;
     OPTIMIZE_TOKEN_MANAGER = true;
   //DEBUG_TOKEN_MANAGER = true;
  @@ -125,6 +125,7 @@
       (<LETTER>|<DIGIT>)*
     >
   
  +| < SIGRAM: (<CJK>)+ >
   | < #ALPHA: (<LETTER>)+>
   | < #LETTER:					  // unicode letters
         [
  @@ -133,7 +134,11 @@
          "\u00c0"-"\u00d6",
          "\u00d8"-"\u00f6",
          "\u00f8"-"\u00ff",
  -       "\u0100"-"\u1fff",
  +       "\u0100"-"\u1fff"
  +      ]
  +  >
  +| < #CJK:             // non-alphabets
  +      [
          "\u3040"-"\u318f",
          "\u3300"-"\u337f",
          "\u3400"-"\u3d2d",
  @@ -182,6 +187,7 @@
       token = <EMAIL> |
       token = <HOST> |
       token = <NUM> |
  +    token = <SIGRAM> |
       token = <EOF>
      )
       {