You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2023/06/14 19:17:23 UTC
[lucene] 02/03: hunspell (minor): reduce allocations when reading the dictionary's morphological data (#12323)
This is an automated email from the ASF dual-hosted git repository.
donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit a2b47b0923658594c4ddac0b810f7998e6332c16
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Thu Jun 1 11:37:38 2023 +0200
hunspell (minor): reduce allocations when reading the dictionary's morphological data (#12323)
There can be many entries with morph data, so we'd better avoid compiling and matching regexes, and even allocating streams.
(cherry picked from commit 4bf1b9420990de1453b9b4bb145d7d37dc750f07)
---
.../lucene/analysis/hunspell/Dictionary.java | 26 ++++++++++++++--------
1 file changed, 17 insertions(+), 9 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 88ed129bd66..b7a4029a523 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -992,7 +992,7 @@ public class Dictionary {
// if we haven't seen any custom morphological data, try to parse one
if (!hasCustomMorphData) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
- if (morphStart >= 0 && morphStart < line.length()) {
+ if (morphStart >= 0) {
String data = line.substring(morphStart + 1);
hasCustomMorphData =
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
@@ -1321,14 +1321,22 @@ public class Dictionary {
if (morphData.isBlank()) {
return Collections.emptyList();
}
- return Arrays.stream(morphData.split("\\s+"))
- .filter(
- s ->
- s.length() > 3
- && Character.isLetter(s.charAt(0))
- && Character.isLetter(s.charAt(1))
- && s.charAt(2) == ':')
- .collect(Collectors.toList());
+
+ List<String> result = null;
+ int start = 0;
+ for (int i = 0; i <= morphData.length(); i++) {
+ if (i == morphData.length() || Character.isWhitespace(morphData.charAt(i))) {
+ if (i - start > 3
+ && Character.isLetter(morphData.charAt(start))
+ && Character.isLetter(morphData.charAt(start + 1))
+ && morphData.charAt(start + 2) == ':') {
+ if (result == null) result = new ArrayList<>();
+ result.add(morphData.substring(start, i));
+ }
+ start = i + 1;
+ }
+ }
+ return result == null ? List.of() : result;
}
boolean hasFlag(IntsRef forms, char flag) {