You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/06/20 13:25:26 UTC
[doris-thirdparty] branch clucene updated: [Fix](standard analyzer) change standard analyzer CJK tokenizer, align it to newest standard analyzer mode (#92)
This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 5428108f [Fix](standard analyzer) change standard analyzer CJK tokenizer, align it to newest standard analyzer mode (#92)
5428108f is described below
commit 5428108ff1c04d68d501ad6c57c60f17156e0933
Author: airborne12 <ai...@gmail.com>
AuthorDate: Tue Jun 20 21:25:20 2023 +0800
[Fix](standard analyzer) change standard analyzer CJK tokenizer, align it to newest standard analyzer mode (#92)
---
src/core/CLucene/analysis/standard/StandardTokenizer.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/core/CLucene/analysis/standard/StandardTokenizer.cpp b/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
index 98d98283..d030e1ab 100644
--- a/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
+++ b/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
@@ -66,7 +66,7 @@ CL_NS_DEF2(analysis,standard)
/* otherMatches is a condition (possibly compound) under which a character
** that's not an ALNUM or UNDERSCORE can be considered not to break the
** span. Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */
- #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
+ #define CONSUME_WORD _CONSUME_AS_LONG_AS((ALNUM || UNDERSCORE) && !_CJK)
/*
** Consume CJK characters
@@ -150,7 +150,7 @@ CL_NS_DEF2(analysis,standard)
continue;
} else if (SPACE) {
continue;
- } else if (ALPHA || UNDERSCORE) {
+ } else if ((ALPHA || UNDERSCORE) && !_CJK) {
tokenStart = rdPos;
t = ReadAlphaNum(ch,t);
if ( t != NULL) return t;
@@ -265,6 +265,9 @@ CL_NS_DEF2(analysis,standard)
int ch = prev;
CONSUME_WORD;
+ if (_CJK) {
+ unReadChar();
+ }
if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
switch(ch) { /* What follows the first alphanum segment? */
case '.':
@@ -293,7 +296,7 @@ CL_NS_DEF2(analysis,standard)
str.appendChar(prev);
int ch = prev;
- CONSUME_CJK;
+ //CONSUME_CJK;
}
return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org