You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/07/06 10:20:08 UTC

lucene-solr:branch_6x: LUCENE-7287: normalize Ukrainian morfologik dictionary to have unique token+lemma pairs

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x ee21bb3b3 -> 6c730ab74


LUCENE-7287: normalize Ukrainian morfologik dictionary to have unique token+lemma pairs


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6c730ab7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6c730ab7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6c730ab7

Branch: refs/heads/branch_6x
Commit: 6c730ab74f2ac8a865d2d514344db18572f059da
Parents: ee21bb3
Author: Mike McCandless <mi...@apache.org>
Authored: Wed Jul 6 06:17:32 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Wed Jul 6 06:19:19 2016 -0400

----------------------------------------------------------------------
 .../apache/lucene/analysis/uk/mapping_uk.txt    |  19 +++++++++++++++++++
 .../apache/lucene/analysis/uk/ukrainian.dict    | Bin 1707759 -> 1989243 bytes
 .../analysis/uk/TestUkrainianAnalyzer.java      |  19 +++++++++++++------
 3 files changed, 32 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c730ab7/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt
new file mode 100644
index 0000000..1142604
--- /dev/null
+++ b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt
@@ -0,0 +1,19 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This map normalizes some characters used in Ukrainian text
+"\u2019" => "'"
+"\u02BC" => "'"
+
+# Remove accent
+"\u0301" => ""

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c730ab7/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict
index 679e392..2468970 100644
Binary files a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict and b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c730ab7/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
index 87d3be5..a38fc63 100644
--- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
+++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
@@ -37,22 +37,29 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new UkrainianMorfologikAnalyzer();
-    assertAnalyzesTo(a, "\u0426\u044f \u043f'\u0454\u0441\u0430 \u0443 \u0441\u0432\u043e\u044e \u0447\u0435\u0440\u0433\u0443 \u0440\u0443\u0445\u0430\u0454\u0442\u044c\u0441\u044f \u043f\u043e \u043a\u043e\u043b\u0443.",
-                     new String[] { "\u043f'\u0454\u0441\u0430", "\u0447\u0435\u0440\u0433\u0430", "\u0440\u0443\u0445\u0430\u0442\u0438\u0441\u044f", "\u043a\u043e\u043b\u0430", "\u043a\u043e\u043b\u043e", "\u043a\u043e\u043b\u043e", "\u043a\u0456\u043b", "\u043a\u0456\u043b" });
+    assertAnalyzesTo(a, "\u0426\u044f \u043f'\u0454\u0441\u0430, \u0443 \u0441\u0432\u043e\u044e \u0447\u0435\u0440\u0433\u0443, \u0440\u0443\u0445\u0430\u0454\u0442\u044c\u0441\u044f \u043f\u043e \u0435\u043c\u043e\u0446\u0456\u0439\u043d\u043e-\u043d\u0430\u043f\u0440\u0443\u0436\u0435\u043d\u043e\u043c\u0443 \u043a\u043e\u043b\u0443 \u0437\u0430 \u0440\u0438\u0442\u043c-\u0435\u043d\u0434-\u0431\u043b\u044e\u0437\u043e\u043c.",
+                     new String[] { "\u043f'\u0454\u0441\u0430", "\u0447\u0435\u0440\u0433\u0430", "\u0440\u0443\u0445\u0430\u0442\u0438\u0441\u044f", "\u0435\u043c\u043e\u0446\u0456\u0439\u043d\u043e", "\u043d\u0430\u043f\u0440\u0443\u0436\u0435\u043d\u0438\u0439", "\u043a\u043e\u043b\u0430", "\u043a\u043e\u043b\u043e", "\u043a\u0456\u043b", "\u0440\u0438\u0442\u043c", "\u0435\u043d\u0434", "\u0431\u043b\u044e\u0437" });
     a.close();
   }
 
   public void testSpecialCharsTokenStream() throws Exception {
     Analyzer a = new UkrainianMorfologikAnalyzer();
-    assertAnalyzesTo(a, "\u0426\u044f \u043f\u02bc\u0454\u0441\u0430, \u0443 \u0441\u0432\u043e\u0301\u044e \u0447\u0435\u0440\u0433\u0443, \u0440\u0443\u0445\u0430\u0454\u0442\u044c\u0441\u044f \u043f\u043e \u043a\u043e\u043b\u0443.",
-                     new String[] { "\u043f'\u0454\u0441\u0430", "\u0447\u0435\u0440\u0433\u0430", "\u0440\u0443\u0445\u0430\u0442\u0438\u0441\u044f", "\u043a\u043e\u043b\u0430", "\u043a\u043e\u043b\u043e", "\u043a\u043e\u043b\u043e", "\u043a\u0456\u043b", "\u043a\u0456\u043b" });
+    assertAnalyzesTo(a, "\u0426\u044f \u043f\u02bc\u0454\u0441\u0430, \u0443 \u0441\u0432\u043e\u0301\u044e \u0447\u0435\u0440\u0433\u0443 \u0440\u0443\u0445\u0430\u0454\u0442\u044c\u0441\u044f.",
+                     new String[] { "\u043f'\u0454\u0441\u0430", "\u0447\u0435\u0440\u0433\u0430", "\u0440\u0443\u0445\u0430\u0442\u0438\u0441\u044f" });
     a.close();
   }
 
   public void testCapsTokenStream() throws Exception {
     Analyzer a = new UkrainianMorfologikAnalyzer();
-    assertAnalyzesTo(a, "\u0426\u0435\u0439 \u0427\u0430\u0439\u043a\u043e\u0432\u0441\u044c\u043a\u0438\u0439.",
-                     new String[] { "\u0447\u0430\u0439\u043a\u043e\u0432\u0441\u044c\u043a\u0438\u0439" });
+    assertAnalyzesTo(a, "\u0426\u0435\u0439 \u0427\u0430\u0439\u043a\u043e\u0432\u0441\u044c\u043a\u0438\u0439 \u0456 \u0490\u0435\u0442\u0435.",
+                     new String[] { "\u0447\u0430\u0439\u043a\u043e\u0432\u0441\u044c\u043a\u0438\u0439", "\u0491\u0435\u0442\u0435" });
+    a.close();
+  }
+
+  public void testSampleSentence() throws Exception {
+    Analyzer a = new UkrainianMorfologikAnalyzer();
+    assertAnalyzesTo(a, "\u0426\u0435 \u2014 \u043f\u0440\u043e\u0435\u043a\u0442 \u0433\u0435\u043d\u0435\u0440\u0443\u0432\u0430\u043d\u043d\u044f \u0441\u043b\u043e\u0432\u043d\u0438\u043a\u0430 \u0437 \u0442\u0435\u0433\u0430\u043c\u0438 \u0447\u0430\u0441\u0442\u0438\u043d \u043c\u043e\u0432\u0438 \u0434\u043b\u044f \u0443\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u043e\u0457 \u043c\u043e\u0432\u0438.",
+                     new String[] { "\u043f\u0440\u043e\u0435\u043a\u0442", "\u0433\u0435\u043d\u0435\u0440\u0443\u0432\u0430\u043d\u043d\u044f", "\u0441\u043b\u043e\u0432\u043d\u0438\u043a", "\u0442\u0435\u0433", "\u0447\u0430\u0441\u0442\u0438\u043d\u0430", "\u043c\u043e\u0432\u0430", "\u0443\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430", "\u0443\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0438\u0439", "\u043c\u043e\u0432\u0430" });
     a.close();
   }