Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:31 UTC

[17/24] lucene-solr:branch_7x: LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
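
For illustration only (not part of this commit), here is a minimal sketch of how the new '<EMOJI>' token type can be observed through the TokenStream API; the sample text, class name, and expected behavior are assumptions based on the commit message above:

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class EmojiTokenTypeDemo {
      public static void main(String[] args) throws Exception {
        // Tokenize text containing the emoji ZWJ sequence U+1F46E U+1F3FB U+200D
        // U+2642 U+FE0F ("man police officer: light skin tone") and print each
        // token with its type; after this upgrade the whole sequence should come
        // back as a single token typed "<EMOJI>".
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(
            "officer \uD83D\uDC6E\uD83C\uDFFB\u200D\u2642\uFE0F"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
          System.out.println(term.toString() + " -> " + type.type());
        }
        tokenizer.end();
        tokenizer.close();
      }
    }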

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl
new file mode 100644
index 0000000..446253d
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/perl
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use warnings;
+use strict;
+use File::Spec;
+use Getopt::Long;
+use LWP::UserAgent;
+
+my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
+
+my $version = '';
+unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
+    print STDERR "Usage: $script_name -v <version>\n";
+    print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n"
+        if ($version);
+    exit 1;
+}
+my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt";
+my $underscore_version = $version;
+$underscore_version =~ s/\./_/g;
+my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}";
+my $output_filename = "${class_name}.java";
+my $header =<<"__HEADER__";
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.junit.Ignore;
+
+/**
+ * This class was automatically generated by ${script_name}
+ * from: ${url}
+ *
+ * emoji-test.txt contains emoji char sequences, which are represented as
+ * tokenization tests in this class.
+ * 
+ */
+\@Ignore
+public class ${class_name} extends BaseTokenStreamTestCase {
+
+  public void test(Analyzer analyzer) throws Exception {
+    for (int i = 0 ; i < tests.length ; i += 2) {
+      String test = tests[i + 1];
+      try {
+        assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
+      } catch (Throwable t) {
+        throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);        
+      }
+    }
+  }
+
+  private String[] tests = new String[] {
+__HEADER__
+
+my @tests = split /\r?\n/, get_URL_content($url);
+
+my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
+open OUT, ">$output_path"
+    or die "Error opening '$output_path' for writing: $!";
+
+print STDERR "Writing '$output_path'...";
+
+print OUT $header;
+
+my $isFirst = 1;
+for my $line (@tests) {
+    next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
+
+    print OUT ",\n\n" unless $isFirst;
+    $isFirst = 0;
+
+    # Example line: 1F46E 1F3FB 200D 2642 FE0F                 ; fully-qualified     # 👮🏻‍♂️ man police officer: light skin tone
+    $line =~ s/\s+$//;     # Trim trailing whitespace
+    $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
+    print OUT "    \"$line\",\n";
+    my ($test_string) = $line =~ /^(.*?)\s*;/;
+    $test_string =~ s/([0-9A-F]+)/\\u$1/g;
+    $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
+    $test_string =~ s/\s//g;
+    print OUT "    \"${test_string}\"";
+}
+print OUT "  };\n}\n";
+close OUT;
+print STDERR "done.\n";
+
+
+# sub above_BMP_char_to_surrogates
+#
+# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
+# to the corresponding UTF-16 surrogate pair
+#
+# Assumption: input string is a sequence of more than four hex digits
+#
+sub above_BMP_char_to_surrogates {
+    my $ch = hex(shift);
+    my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10);
+    my $low_surrogate  = 0xDC00 + ($ch & 0x3FF);
+    return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate);
+}
+
+
+# sub get_URL_content
+#
+# Retrieves and returns the content of the given URL.
+#
+sub get_URL_content {
+    my $url = shift;
+    print STDERR "Retrieving '$url'...";
+    my $user_agent = LWP::UserAgent->new;
+    my $request = HTTP::Request->new(GET => $url);
+    my $response = $user_agent->request($request);
+    unless ($response->is_success) {
+        print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
+        exit 1;
+    }
+    print STDERR "done.\n";
+    return $response->content;
+}
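
A side note on the surrogate math in above_BMP_char_to_surrogates() above: a rough Java counterpart, for illustration only (not part of the patch; the class and method names are made up), cross-checked against the JDK's Character.toChars():

    public class SurrogatePairDemo {
      // Mirrors the Perl arithmetic: high = 0xD800 + ((cp - 0x10000) >> 10),
      //                              low  = 0xDC00 + (cp & 0x3FF)
      static String toSurrogateEscapes(int codePoint) {
        int high = 0xD800 + ((codePoint - 0x10000) >> 10);
        int low  = 0xDC00 + (codePoint & 0x3FF);
        return String.format("\\u%04X\\u%04X", high, low);
      }

      public static void main(String[] args) {
        int codePoint = 0x1F46E;                      // U+1F46E POLICE OFFICER
        char[] viaJdk = Character.toChars(codePoint); // yields {0xD83D, 0xDC6E}
        System.out.println(toSurrogateEscapes(codePoint));             // \uD83D\uDC6E
        System.out.printf("\\u%04X\\u%04X%n", (int) viaJdk[0], (int) viaJdk[1]);
      }
    }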

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
index 3004035..dd16cb6 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
@@ -40,8 +40,6 @@ $underscore_version =~ s/\./_/g;
 my $class_name = "WordBreakTestUnicode_${underscore_version}";
 my $output_filename = "${class_name}.java";
 my $header =<<"__HEADER__";
-package org.apache.lucene.analysis;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -59,6 +57,8 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+package org.apache.lucene.analysis.standard;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.junit.Ignore;
@@ -81,7 +81,7 @@ import org.junit.Ignore;
  *    \\p{WordBreak = Hebrew_Letter}
  *    \\p{WordBreak = Katakana}
  *    \\p{WordBreak = Numeric}         (Excludes full-width Arabic digits)
- *    [\\uFF10-\\uFF19]                (Full-width Arabic digits)
+ *    [\\uFF10-\\uFF19]                 (Full-width Arabic digits)
  */
 \@Ignore
 public class ${class_name} extends BaseTokenStreamTestCase {
@@ -91,6 +91,7 @@ __HEADER__
 
 my $codepoints = [];
 map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
+my $regional_indicator_codepoints = [];
 # Complex_Context is an alias for 'SA', which is used in LineBreak.txt
 # Using lowercase versions of property value names to allow for case-
 # insensitive comparison with the names in the Unicode data files.
@@ -98,7 +99,9 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
 parse_Unicode_data_file($scripts_url, $codepoints, 
                         {'han' => 1, 'hiragana' => 1});
 parse_Unicode_data_file($word_break_url, $codepoints,
-                        {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
+                        {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1, 'e_base' => 1,
+                         'e_modifier' => 1, 'glue_after_zwj' => 1, 'e_base_gaz' => 1});
+parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints, {'regional_indicator' => 1});
 my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
 
 my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@@ -124,10 +127,21 @@ for my $line (@tests) {
   $test_string =~ s/\\u000D/\\r/g;
   $test_string =~ s/\\u0022/\\\"/g;
   $sequence =~ s/^\s*÷\s*//; # Trim leading break character
+  
+  # TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
+  # ÷ 200D ÷ 261D ÷  #  ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
+  if ($sequence =~ /^200D\s*÷\s*261D$/) {
+    print OUT "    // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
+    next;
+  }
+  
   my @tokens = ();
+  my $isfirst = 0;
   for my $candidate (split /\s*÷\s*/, $sequence) {
+    $isfirst = 1;
     my @chars = ();
-    my $has_wanted_char = 0;
+    my $has_wanted_chars = 0;
+    my $prev_char_regional_indicator = 0;
     while ($candidate =~ /([0-9A-F]+)/gi) {
       my $hexchar = $1;
       if (4 == length($hexchar)) {
@@ -135,12 +149,21 @@ for my $line (@tests) {
       } else {
         push @chars, above_BMP_char_to_surrogates($hexchar);
       }
-      unless ($has_wanted_char) {
-        $has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
+      unless ($has_wanted_chars) {
+        my $codepoint = hex($hexchar);
+        if (defined($codepoints->[$codepoint])) {
+          $has_wanted_chars = 1;
+        } elsif (defined($regional_indicator_codepoints->[$codepoint])) {
+          if (1 == $prev_char_regional_indicator) {
+            $has_wanted_chars = 1; # must be 2 regional indicators in a row
+          } else {
+            $prev_char_regional_indicator = 1;
+          }
+        }
       }
     }
-    if ($has_wanted_char) {
-      push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
+    if ($has_wanted_chars) {
+      push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
     }
   }
   print OUT "    assertAnalyzesTo(analyzer, \"${test_string}\",\n";