You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/06/20 13:25:26 UTC

[doris-thirdparty] branch clucene updated: [Fix](standard analyzer) change standard analyzer CJK tokenizer, align it to newest standard analyzer mode (#92)

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 5428108f [Fix](standard analyzer) change standard analyzer CJK tokenizer, align it to newest standard analyzer mode (#92)
5428108f is described below

commit 5428108ff1c04d68d501ad6c57c60f17156e0933
Author: airborne12 <ai...@gmail.com>
AuthorDate: Tue Jun 20 21:25:20 2023 +0800

    [Fix](standard analyzer) change standard analyzer CJK tokenizer, align it to newest standard analyzer mode (#92)
---
 src/core/CLucene/analysis/standard/StandardTokenizer.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/core/CLucene/analysis/standard/StandardTokenizer.cpp b/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
index 98d98283..d030e1ab 100644
--- a/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
+++ b/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
@@ -66,7 +66,7 @@ CL_NS_DEF2(analysis,standard)
   /* otherMatches is a condition (possibly compound) under which a character
   ** that's not an ALNUM or UNDERSCORE can be considered not to break the
   ** span.  Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */
-  #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
+  #define CONSUME_WORD                  _CONSUME_AS_LONG_AS((ALNUM || UNDERSCORE) && !_CJK)
   
   /*
   ** Consume CJK characters
@@ -150,7 +150,7 @@ CL_NS_DEF2(analysis,standard)
         continue;
       } else if (SPACE) {
         continue;
-      } else if (ALPHA || UNDERSCORE) {
+      } else if ((ALPHA || UNDERSCORE) && !_CJK) {
         tokenStart = rdPos;
         t = ReadAlphaNum(ch,t);
         if ( t != NULL) return t;
@@ -265,6 +265,9 @@ CL_NS_DEF2(analysis,standard)
 		  int ch = prev;
 
 		  CONSUME_WORD;
+                  if (_CJK) {
+                      unReadChar();
+                  }
 		  if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
 			  switch(ch) { /* What follows the first alphanum segment? */
 				  case '.':
@@ -293,7 +296,7 @@ CL_NS_DEF2(analysis,standard)
 		  str.appendChar(prev);
 		  int ch = prev;
 
-		  CONSUME_CJK;
+		  //CONSUME_CJK;
 	  }
 	  return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org