You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/01/23 10:31:04 UTC

[34/41] lucene-solr:jira/solr-11702: LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to correctly handle hyphenation patterns with indicator >= 7.

LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to correctly handle hyphenation patterns with indicator >= 7.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f5e22670
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f5e22670
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f5e22670

Branch: refs/heads/jira/solr-11702
Commit: f5e2267097df5bee3942c719facbca137a56f3f8
Parents: fc6f3a4
Author: Adrien Grand <jp...@gmail.com>
Authored: Mon Jan 22 08:46:01 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Mon Jan 22 08:46:01 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                                   |  3 +++
 .../compound/hyphenation/HyphenationTree.java        |  4 ++--
 .../compound/TestCompoundWordTokenFilter.java        | 15 +++++++++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5e22670/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6b90215..e95d066 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -146,6 +146,9 @@ Bug Fixes
 
 * LUCENE-8130: Fix NullPointerException from TermStates.toString() (Mike McCandless)
 
+* LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to handle correctly
+  hyphenation patterns with indicator >= 7. (Holger Bruch via Adrien Grand)
+
 Other
 
 * LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5e22670/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
index 0f7dd2b..3c72b4f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
@@ -89,7 +89,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
     StringBuilder buf = new StringBuilder();
     byte v = vspace.get(k++);
     while (v != 0) {
-      char c = (char) ((v >>> 4) - 1 + '0');
+      char c = (char) (((v & 0xf0 )>>> 4) - 1 + '0');
       buf.append(c);
       c = (char) (v & 0x0f);
       if (c == 0) {
@@ -151,7 +151,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
     StringBuilder buf = new StringBuilder();
     byte v = vspace.get(k++);
     while (v != 0) {
-      char c = (char) ((v >>> 4) - 1);
+      char c = (char) (((v & 0xf0 )>>> 4) - 1);
       buf.append(c);
       c = (char) (v & 0x0f);
       if (c == 0) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5e22670/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index ed3abe4..67a1bb4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -262,6 +262,21 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     }
 
   }
+  
+  public void testLucene8124() throws Exception {
+    InputSource is = new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm());
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(is);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        whitespaceMockTokenizer(
+                "Rindfleisch"),
+        hyphenator);
+
+    // TODO: "Rindfleisch" being returned twice is a separate issue in HyphenationCompoundWordTokenFilter
+    assertTokenStreamContents(tf, new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"});
+  }
+
 
   public static interface MockRetainAttribute extends Attribute {
     void setRetain(boolean attr);