You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/29 10:47:06 UTC

[lucene-solr] branch master updated: LUCENE-9702: Hunspell: support alternate casing for short language codes (#2253)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new ff943ec  LUCENE-9702: Hunspell: support alternate casing for short language codes (#2253)
ff943ec is described below

commit ff943ece8fced15d7d585469929f97405b45513e
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 29 11:46:45 2021 +0100

    LUCENE-9702: Hunspell: support alternate casing for short language codes (#2253)
---
 .../src/java/org/apache/lucene/analysis/hunspell/Dictionary.java  | 4 +++-
 .../org/apache/lucene/analysis/hunspell/SpellCheckerTest.java     | 5 +++++
 .../src/test/org/apache/lucene/analysis/hunspell/dotless_i.aff    | 2 ++
 .../src/test/org/apache/lucene/analysis/hunspell/dotless_i.dic    | 4 ++++
 .../src/test/org/apache/lucene/analysis/hunspell/dotless_i.good   | 8 ++++++++
 .../src/test/org/apache/lucene/analysis/hunspell/dotless_i.wrong  | 6 ++++++
 6 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index f38ab59..4dc76ab 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -374,7 +374,9 @@ public class Dictionary {
         fullStrip = true;
       } else if ("LANG".equals(firstWord)) {
         language = singleArgument(reader, line);
-        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
+        int underscore = language.indexOf("_");
+        String langCode = underscore < 0 ? language : language.substring(0, underscore);
+        alternateCasing = langCode.equals("tr") || langCode.equals("az");
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
       } else if ("FORBIDDENWORD".equals(firstWord)) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index a51a43b..4c5601b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -57,6 +57,11 @@ public class SpellCheckerTest extends StemmerTestBase {
   }
 
   @Test
+  public void dotless_i() throws Exception {
+    doTest("dotless_i");
+  }
+
+  @Test
   public void needAffixOnAffixes() throws Exception {
     doTest("needaffix5");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.aff
new file mode 100644
index 0000000..e9b3a60
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.aff
@@ -0,0 +1,2 @@
+SET UTF-8
+LANG tr
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.dic
new file mode 100644
index 0000000..07452ba
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.dic
@@ -0,0 +1,4 @@
+3
+iç
+ışık
+Diyarbakır
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.good
new file mode 100644
index 0000000..50f046b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.good
@@ -0,0 +1,8 @@
+Diyarbakır
+DİYARBAKIR
+iç
+İç
+ışık
+Işık
+İÇ
+IŞIK
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.wrong
new file mode 100644
index 0000000..ddd49d3
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.wrong
@@ -0,0 +1,6 @@
+Diyarbakir
+DIYARBAKIR
+Iç
+İşık
+IÇ
+İŞIK