You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ad...@apache.org on 2023/04/17 07:41:38 UTC
[doris-thirdparty] branch clucene updated: [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)
This is an automated email from the ASF dual-hosted git repository.
adonisling pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new b4e4f8f [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)
b4e4f8f is described below
commit b4e4f8f4b8f3407057fa7061a83dbfa16071642b
Author: airborne12 <ai...@gmail.com>
AuthorDate: Mon Apr 17 15:41:32 2023 +0800
[enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)
---
src/core/CLucene/analysis/Analyzers.cpp | 46 ++++++++++++++++++++++++++++
src/core/CLucene/analysis/Analyzers.h | 6 ++++
src/core/CLucene/util/stringUtil.cpp | 1 +
src/core/CLucene/util/stringUtil.h | 53 ++++++++++++++++++++++++++++++++-
src/test/analysis/TestAnalysis.cpp | 2 ++
5 files changed, 107 insertions(+), 1 deletion(-)
diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp
index 6ca4183..c5c9b6a 100644
--- a/src/core/CLucene/analysis/Analyzers.cpp
+++ b/src/core/CLucene/analysis/Analyzers.cpp
@@ -13,6 +13,52 @@
CL_NS_USE(util)
CL_NS_DEF(analysis)
+// Bulk normalization for the char specialization: reinterprets both buffers
+// as unsigned bytes and forwards them to to_lower(), which writes the
+// converted copy of the len bytes at src into dst.
+template<>
+void CharTokenizer<char>::normalize(const char *src, int64_t len, char *dst) {
+    const uint8_t *in = reinterpret_cast<const uint8_t *>(src);
+    uint8_t *out = reinterpret_cast<uint8_t *>(dst);
+    to_lower(in, len, out);
+}
+
+// Scans the input for the next run of characters accepted by isTokenChar(),
+// stores the bulk-normalized run in `token` with its start/end offsets, and
+// returns `token`. Returns NULL when the input is exhausted and no characters
+// are pending. A run is cut once it reaches LUCENE_MAX_WORD_LEN characters.
+template<>
+Token *CharTokenizer<char>::next(Token *token) {
+    int32_t tokenLen = 0;
+    int32_t tokenStart = offset;
+    for (;;) {
+        ++offset;
+
+        // Everything buffered so far has been consumed: ask the reader for more.
+        if (bufferIndex >= dataLen) {
+            dataLen = input->read((const void **) &ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
+            if (dataLen == -1)
+                dataLen = 0;
+            bufferIndex = 0;
+        }
+
+        // Nothing left to read: flush a pending token or report the end.
+        if (dataLen <= 0) {
+            if (tokenLen <= 0)
+                return NULL;
+            break;
+        }
+
+        const char ch = ioBuffer[bufferIndex++];
+        if (!isTokenChar(ch)) {
+            if (tokenLen > 0)
+                break; // a separator ends the current token
+            continue;  // still skipping separators ahead of a token
+        }
+
+        if (tokenLen == 0)
+            tokenStart = offset - 1; // first character of a new token
+
+        // Characters are buffered raw; normalization happens once per token below.
+        buffer[tokenLen++] = ch;
+        if (tokenLen == LUCENE_MAX_WORD_LEN)
+            break; // buffer is full, emit what has been collected
+    }
+
+    // Normalize the whole token in one call into a NUL-terminated scratch copy.
+    char normalized[LUCENE_MAX_WORD_LEN + 1];
+    normalize(buffer, tokenLen, normalized);
+    normalized[tokenLen] = 0;
+    token->set(normalized, tokenStart, tokenStart + tokenLen);
+
+    return token;
+}
+
template<typename T>
LetterTokenizer<T>::LetterTokenizer(CL_NS(util)::Reader* in):
CharTokenizer<T>(in) {
diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h
index fc65204..6ab819e 100644
--- a/src/core/CLucene/analysis/Analyzers.h
+++ b/src/core/CLucene/analysis/Analyzers.h
@@ -10,6 +10,7 @@
#include "CLucene/util/VoidList.h"
#include "CLucene/util/VoidMap.h"
#include "CLucene/util/CLStreams.h"
+#include "CLucene/util/stringUtil.h"
#include "AnalysisHeader.h"
CL_NS_DEF(analysis)
@@ -33,6 +34,11 @@ protected:
* to, e.g., lowercase tokens. */
virtual T normalize(const T c) const{return c;};
+    /**
+     * Normalizes a run of characters in one call.
+     * Applies the single-character normalize() to each of the len elements
+     * starting at src and writes the results to the matching positions of dst.
+     * Does nothing when len is zero or negative.
+     *
+     * @param src first character to normalize
+     * @param len number of characters to process
+     * @param dst destination buffer; must have room for len characters
+     */
+    virtual void normalize(const T *src, int64_t len, T *dst) {
+        if (len <= 0)
+            return;
+        // The end pointer has to be computed once up front: a condition of
+        // `src < src + len` moves the bound together with src, so it never
+        // becomes false and the loop would run past both buffers.
+        const T *const end = src + len;
+        for (; src < end; ++src, ++dst)
+            *dst = normalize(*src);
+    };
+
public:
explicit CharTokenizer(CL_NS(util)::Reader* in):Tokenizer(in),
offset(0),
diff --git a/src/core/CLucene/util/stringUtil.cpp b/src/core/CLucene/util/stringUtil.cpp
index 68437f5..07aa155 100644
--- a/src/core/CLucene/util/stringUtil.cpp
+++ b/src/core/CLucene/util/stringUtil.cpp
@@ -1,6 +1,7 @@
//
// Created by 姜凯 on 2022/9/20.
//
+#include "CLucene/_ApiHeader.h"
#include "stringUtil.h"
template <>
diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h
index eeddce7..f6a4958 100644
--- a/src/core/CLucene/util/stringUtil.h
+++ b/src/core/CLucene/util/stringUtil.h
@@ -5,7 +5,11 @@
#ifndef _lucene_util__stringutil_H
#define _lucene_util__stringutil_H
-#include "CLucene/_ApiHeader.h"
+#ifdef __SSE2__
+#include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
+#endif
template <typename T>
const T* LUCENE_BLANK_SSTRING();
@@ -24,4 +28,51 @@ T *strDuplicate(const T *str);
template<typename T>
size_t lenOfString(const T *str);
+
+/**
+ * Byte-wise ASCII case converter.
+ *
+ * transfer() copies [src, src_end) to dst, XOR-ing every byte that lies in
+ * [not_case_lower_bound, not_case_upper_bound] with the bit that separates
+ * 'A' from 'a'; every other byte is copied unchanged. When SSE2 (or the
+ * aarch64 sse2neon shim) is available, whole __m128i-sized chunks are handled
+ * with vector instructions and only the remainder goes through the scalar loop.
+ */
+template <char not_case_lower_bound, char not_case_upper_bound>
+class LowerUpperImpl {
+public:
+    static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
+        const auto case_bit = 'A' ^ 'a';
+
+#if defined(__SSE2__) || defined(__aarch64__)
+        const auto chunk_bytes = sizeof(__m128i);
+        // Last position reachable in whole vector-sized steps.
+        const auto vector_end = src_end - (src_end - src) % chunk_bytes;
+
+        // The compares below are strict, so the bounds are widened by one.
+        const auto below_range = _mm_set1_epi8(not_case_lower_bound - 1);
+        const auto above_range = _mm_set1_epi8(not_case_upper_bound + 1);
+        const auto case_bit_lanes = _mm_set1_epi8(case_bit);
+
+        while (src < vector_end) {
+            const auto chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+            const auto over_lower = _mm_cmpgt_epi8(chunk, below_range);
+            const auto under_upper = _mm_cmplt_epi8(chunk, above_range);
+            const auto in_range = _mm_and_si128(over_lower, under_upper);
+            // Lanes inside the range keep the case bit in the mask, the rest get zero.
+            const auto flip_mask = _mm_and_si128(case_bit_lanes, in_range);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_xor_si128(chunk, flip_mask));
+            src += chunk_bytes;
+            dst += chunk_bytes;
+        }
+#endif
+
+        // Scalar pass: the tail after the vector loop, or the whole input without SIMD.
+        while (src < src_end) {
+            const bool needs_flip = *src >= not_case_lower_bound && *src <= not_case_upper_bound;
+            *dst = needs_flip ? *src ^ case_bit : *src;
+            ++src;
+            ++dst;
+        }
+    }
+};
+
+// Writes a copy of the len bytes at src into dst with every byte in 'A'..'Z'
+// case-flipped by LowerUpperImpl; all other bytes are copied as they are.
+// Nothing is read or written when len is zero or negative.
+static void to_lower(const uint8_t* src, int64_t len, uint8_t* dst) {
+    if (len > 0) {
+        LowerUpperImpl<'A', 'Z'>::transfer(src, src + len, dst);
+    }
+}
+
+// Writes a copy of the len bytes at src into dst with every byte in 'a'..'z'
+// case-flipped by LowerUpperImpl; all other bytes are copied as they are.
+// Nothing is read or written when len is zero or negative.
+//
+// src: first byte of the input
+// len: number of bytes to convert
+// dst: output buffer with room for len bytes
+static void to_upper(const uint8_t* src, int64_t len, uint8_t* dst) {
+    if (len <= 0) {
+        return;
+    }
+    // transfer() is static, so no LowerUpperImpl object is needed for the call.
+    LowerUpperImpl<'a', 'z'>::transfer(src, src + len, dst);
+}
#endif//_lucene_util__stringutil_H
diff --git a/src/test/analysis/TestAnalysis.cpp b/src/test/analysis/TestAnalysis.cpp
index 73e4337..da51aa0 100644
--- a/src/test/analysis/TestAnalysis.cpp
+++ b/src/test/analysis/TestAnalysis.cpp
@@ -78,6 +78,7 @@ void testTokenStreamField(CuTest *tc) {
void testChar(CuTest *tc) {
const char *text = "This is a test 123_test";
+ std::vector<string> result{"this","is","a","test","123","test"};
SStringReader<char> reader(text, strlen(text));
SimpleAnalyzer<char> analyzer;
TokenStream *stream = analyzer.tokenStream(NULL, &reader);
@@ -85,6 +86,7 @@ void testChar(CuTest *tc) {
int32_t count = 0;
CL_NS(analysis)::Token t;
while (stream->next(&t) != NULL) {
+ assertEquals(true, strCompare(t.termBuffer<char>(), result.at(count).c_str()) == 0);
count++;
}
//printf("count = %d\n", count);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org