You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2019/08/09 19:50:03 UTC

[lucenenet] 02/09: Lucene.Net.Tests.ICU.Search.VectorHighlight.BreakIteratorBoundaryScannerTest: Modified original TestSentenceBoundary test to skip east Asian languages because the mock doesn't override them. Added TestICUWordBoundary and TestICUSentenceBoundary tests to confirm that the boundary scanner works the same with ICU4N as it does with ICU4J with default settings.

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 50623b6f84f8d05bf9e6c99539d67ee56047b38c
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Fri Aug 9 13:50:02 2019 +0700

    Lucene.Net.Tests.ICU.Search.VectorHighlight.BreakIteratorBoundaryScannerTest: Modified original TestSentenceBoundary test to skip east Asian languages because the mock doesn't override them. Added TestICUWordBoundary and TestICUSentenceBoundary tests to confirm that the boundary scanner works the same with ICU4N as it does with ICU4J with default settings.
---
 .../BreakIteratorBoundaryScannerTest.cs            | 60 +++++++++++++++++++++-
 .../Support/TestJDKBreakIterator.cs                |  8 +--
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/src/Lucene.Net.Tests.Highlighter/VectorHighlight/BreakIteratorBoundaryScannerTest.cs b/src/Lucene.Net.Tests.Highlighter/VectorHighlight/BreakIteratorBoundaryScannerTest.cs
index 6210fa8..be527c8 100644
--- a/src/Lucene.Net.Tests.Highlighter/VectorHighlight/BreakIteratorBoundaryScannerTest.cs
+++ b/src/Lucene.Net.Tests.Highlighter/VectorHighlight/BreakIteratorBoundaryScannerTest.cs
@@ -1,6 +1,6 @@
 #if FEATURE_BREAKITERATOR
 using ICU4N.Text;
-using Lucene.Net.ICU.Support;
+using Lucene.Net.Attributes;
 using Lucene.Net.Support;
 using Lucene.Net.Util;
 using NUnit.Framework;
@@ -50,10 +50,30 @@ namespace Lucene.Net.Search.VectorHighlight
             assertEquals(start, scanner.FindEndOffset(text, start));
         }
 
+        // LUCENENET specific - Confirmed that ICU4J 60.1 behaves like this by default...
+        [Test, LuceneNetSpecific]
+        public void TestICUWordBoundary()
+        {
+            StringBuilder text = new StringBuilder(TEXT);
+            BreakIterator bi = BreakIterator.GetWordInstance(CultureInfo.InvariantCulture);
+            IBoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
+
+            int start = TEXT.IndexOf("formance", StringComparison.Ordinal);
+            int expected = TEXT.IndexOf("performance", StringComparison.Ordinal);
+            TestFindStartOffset(text, start, expected, scanner);
+
+            expected = TEXT.IndexOf(", full", StringComparison.Ordinal);
+            TestFindEndOffset(text, start, expected, scanner);
+        }
+
+        // LUCENENET specific - this is the original Lucene test with a mock BreakIterator that
+        // is intended to act (sort of) like the JDK
         [Test]
         public void TestWordBoundary()
         {
             StringBuilder text = new StringBuilder(TEXT);
+            // LUCENENET specific - using a mock of the JDK BreakIterator class, which is just
+            // an ICU BreakIterator with custom rules applied.
             BreakIterator bi = JdkBreakIterator.GetWordInstance(CultureInfo.InvariantCulture);
             IBoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
 
@@ -65,11 +85,49 @@ namespace Lucene.Net.Search.VectorHighlight
             TestFindEndOffset(text, start, expected, scanner);
         }
 
+        // LUCENENET specific - Confirmed that ICU4J 60.1 behaves like this by default...
+        [Test, LuceneNetSpecific]
+        public void TestICUSentenceBoundary()
+        {
+            StringBuilder text = new StringBuilder(TEXT);
+            // we test this with the default locale; it's randomized by LuceneTestCase
+            BreakIterator bi = BreakIterator.GetSentenceInstance(CultureInfo.CurrentCulture);
+            IBoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
+
+            int start = TEXT.IndexOf("any application");
+            int expected = TEXT.IndexOf("It is a");
+            TestFindStartOffset(text, start, expected, scanner);
+
+            expected = TEXT.IndexOf("application that requires") + "application that requires\n".Length;
+            TestFindEndOffset(text, start, expected, scanner);
+        }
+
+        // LUCENENET specific - this is the original Lucene test with a mock BreakIterator that
+        // is intended to act (sort of) like the JDK
         [Test]
         public void TestSentenceBoundary()
         {
+            // LUCENENET specific - using a mock of the JDK BreakIterator class, which is just
+            // an ICU BreakIterator with custom rules applied. East Asian
+            // languages are skipped because the DictionaryBasedBreakIterator is not overridden by the rules.
+            switch (CultureInfo.CurrentCulture.TwoLetterISOLanguageName)
+            {
+                case "th": // Thai
+                case "lo": // Lao
+                case "my": // Burmese
+                case "km": // Khmer
+                case "ja": // Japanese
+                case "ko": // Korean
+                case "zh": // Chinese
+                    Assume.That(false, "This test does not apply to East Asian languages.");
+                    break;
+            }
+
             StringBuilder text = new StringBuilder(TEXT);
             // we test this with default locale, its randomized by LuceneTestCase
+
+            // LUCENENET specific - using a mock of the JDK BreakIterator class, which is just
+            // an ICU BreakIterator with custom rules applied.
             BreakIterator bi = JdkBreakIterator.GetSentenceInstance(CultureInfo.CurrentCulture);
             IBoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
 
diff --git a/src/dotnet/Lucene.Net.Tests.ICU/Support/TestJDKBreakIterator.cs b/src/dotnet/Lucene.Net.Tests.ICU/Support/TestJDKBreakIterator.cs
index ea8651c..1e5efbe 100644
--- a/src/dotnet/Lucene.Net.Tests.ICU/Support/TestJDKBreakIterator.cs
+++ b/src/dotnet/Lucene.Net.Tests.ICU/Support/TestJDKBreakIterator.cs
@@ -262,9 +262,9 @@ namespace Lucene.Net.Tests.ICU.Support
             Assert.AreEqual(400, bi.Last());
         }
 
-        // LUCENENET TODO: This test doesn't pass. We need to customize line iteration in order to get it to. However,
-        // none of the defaults set in lucene use line iteration, so this is low priority. Hopefully, someone works it
-        // out and makes a contribution.
+        // NOTE: This test doesn't pass. We need to customize line iteration in order to get it to. However,
+        // none of the defaults set in lucene use line iteration, so this is low priority. Leaving in place
+        // in case we need to make JDK style line breaks in the future.
         static readonly String LINE_TEXT =
             "Apache\tLucene(TM) is a high-\nperformance, full-featured text search engine library written entirely in Java.";
 
@@ -274,7 +274,7 @@ namespace Lucene.Net.Tests.ICU.Support
         }
 
         [Test]
-        [Ignore("LUCENENET TODO: Setup JDK style line iteration (see jdksent.rbbi and jdkword.rbbi)")]
+        [Ignore("Not required to confirm compatibility with Java, as this is not required by Lucene's tests.")]
         public void TestLineIteration()
         {
             BreakIterator bi = GetLineInstance(System.Globalization.CultureInfo.InvariantCulture);