You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:31 UTC
[17/24] lucene-solr:branch_7x: LUCENE-8527: Upgrade JFlex to 1.7.0.
StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl
new file mode 100644
index 0000000..446253d
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateEmojiTokenizationTest.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/perl
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use warnings;
+use strict;
+use File::Spec;
+use Getopt::Long;
+use LWP::UserAgent;
+
+my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
+
+# Parse and validate the required -v/--version option, e.g. "11.0".
+# The regex is anchored so that only the documented X.Y form is accepted.
+my $version = '';
+unless (GetOptions("version=s" => \$version) && $version =~ /^\d+\.\d+$/) {
+  print STDERR "Usage: $script_name -v <version>\n";
+  print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n"
+    if ($version);
+  exit 1;
+}
+
+# Source data: the emoji-test.txt file published for the requested Emoji version.
+my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt";
+
+# Derive the generated class/file name, e.g. EmojiTokenizationTestUnicode_11_0.
+my $underscore_version = $version;
+$underscore_version =~ s/\./_/g;
+my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}";
+my $output_filename = "${class_name}.java";
+
+# Boilerplate emitted at the top of the generated Java test class;
+# ${script_name}, ${url} and ${class_name} are interpolated below.
+my $header =<<"__HEADER__";
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.junit.Ignore;
+
+/**
+ * This class was automatically generated by ${script_name}
+ * from: ${url}
+ *
+ * emoji-test.txt contains emoji char sequences, which are represented as
+ * tokenization tests in this class.
+ *
+ */
+\@Ignore
+public class ${class_name} extends BaseTokenStreamTestCase {
+
+  public void test(Analyzer analyzer) throws Exception {
+    for (int i = 0 ; i < tests.length ; i += 2) {
+      String test = tests[i + 1];
+      try {
+        assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
+      } catch (Throwable t) {
+        throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
+      }
+    }
+  }
+
+  private String[] tests = new String[] {
+__HEADER__
+
+my @tests = split /\r?\n/, get_URL_content($url);
+
+my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
+# Three-arg open with a lexical handle, joined with low-precedence "or".
+# The previous form -- open OUT, ">$output_path" || die ... -- could never
+# die: "||" bound to the filename string, which is always true.
+open my $out, '>', $output_path
+  or die "Error opening '$output_path' for writing: $!";
+
+print STDERR "Writing '$output_path'...";
+
+print $out $header;
+
+# Emit one pair of array entries per emoji sequence: the raw data line
+# (for error reporting) followed by the \u-escaped test string.
+my $isFirst = 1;
+for my $line (@tests) {
+  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
+
+  # Separate entries with ",\n\n"; the trailing entry gets no comma so the
+  # final "};" below closes the initializer cleanly.
+  print $out ",\n\n" unless $isFirst;
+  $isFirst = 0;
+
+  # Example line: 1F46E 1F3FB 200D 2642 FE0F ; fully-qualified # 👮🏻‍♂️ man police officer: light skin tone
+  $line =~ s/\s+$//;  # Trim trailing whitespace
+  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
+  print $out "    \"$line\",\n";
+  # The test string is everything before the first ';' (the code point list),
+  # with each hex code point turned into a Java \uXXXX escape; code points
+  # above the BMP (5+ hex digits) become surrogate pairs.
+  my ($test_string) = $line =~ /^(.*?)\s*;/;
+  $test_string =~ s/([0-9A-F]+)/\\u$1/g;
+  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
+  $test_string =~ s/\s//g;
+  print $out "    \"${test_string}\"";
+}
+print $out "  };\n}\n";
+# Check close on the write handle: buffered write errors surface here.
+close $out
+  or die "Error closing '$output_path': $!";
+print STDERR "done.\n";
+
+
+# sub above_BMP_char_to_surrogates
+#
+# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
+# to the corresponding UTF-16 surrogate pair
+#
+# Assumption: input string is a sequence more than four hex digits
+#
+sub above_BMP_char_to_surrogates {
+  # Takes a hex string for a code point above U+FFFF and returns the
+  # UTF-16 surrogate pair as two uppercase 4-digit hex strings.
+  my $codepoint = hex(shift);
+  # Standard UTF-16 encoding: subtract 0x10000, then split the remaining
+  # 20 bits into the lead (high) and trail (low) surrogate halves.
+  my $offset = $codepoint - 0x10000;
+  my $lead   = 0xD800 + ($offset >> 10);
+  my $trail  = 0xDC00 + ($offset & 0x3FF);
+  return (sprintf("%04X", $lead), sprintf("%04X", $trail));
+}
+
+
+# sub get_URL_content
+#
+# Retrieves and returns the content of the given URL.
+#
+sub get_URL_content {
+  # Fetches the given URL and returns the raw response body.
+  # On any HTTP failure, reports the status line to STDERR and exits.
+  my $target = shift;
+  print STDERR "Retrieving '$target'...";
+  # LWP::UserAgent->get() builds and dispatches the GET request for us,
+  # equivalent to request(HTTP::Request->new(GET => $url)).
+  my $response = LWP::UserAgent->new->get($target);
+  if (! $response->is_success) {
+    print STDERR "Failed to download '$target':\n\t", $response->status_line, "\n";
+    exit 1;
+  }
+  print STDERR "done.\n";
+  return $response->content;
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
index 3004035..dd16cb6 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
@@ -40,8 +40,6 @@ $underscore_version =~ s/\./_/g;
my $class_name = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
-package org.apache.lucene.analysis;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -59,6 +57,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
+package org.apache.lucene.analysis.standard;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;
@@ -81,7 +81,7 @@ import org.junit.Ignore;
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
- * [\\uFF10-\\uFF19] (Full-width Arabic digits)
+ * [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
@@ -91,6 +91,7 @@ __HEADER__
my $codepoints = [];
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
+my $regional_indicator_codepoints = [];
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files.
@@ -98,7 +99,9 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
- {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
+ {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1, 'e_base' => 1,
+ 'e_modifier' => 1, 'glue_after_zwj' => 1, 'e_base_gaz' => 1});
+parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints, {'regional_indicator' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@@ -124,10 +127,21 @@ for my $line (@tests) {
$test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g;
 $sequence =~ s/^\s*÷\s*//; # Trim leading break character
+
+ # TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
+ # ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
+ if ($sequence =~ /^200D\s*÷\s*261D$/) {
+ print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
+ next;
+ }
+
my @tokens = ();
+ my $isfirst = 0;
 for my $candidate (split /\s*÷\s*/, $sequence) {
+ $isfirst = 1;
my @chars = ();
- my $has_wanted_char = 0;
+ my $has_wanted_chars = 0;
+ my $prev_char_regional_indicator = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
my $hexchar = $1;
if (4 == length($hexchar)) {
@@ -135,12 +149,21 @@ for my $line (@tests) {
} else {
push @chars, above_BMP_char_to_surrogates($hexchar);
}
- unless ($has_wanted_char) {
- $has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
+ unless ($has_wanted_chars) {
+ my $codepoint = hex($hexchar);
+ if (defined($codepoints->[$codepoint])) {
+ $has_wanted_chars = 1;
+ } elsif (defined($regional_indicator_codepoints->[$codepoint])) {
+ if (1 == $prev_char_regional_indicator) {
+ $has_wanted_chars = 1; # must be 2 regional indicators in a row
+ } else {
+ $prev_char_regional_indicator = 1;
+ }
+ }
}
}
- if ($has_wanted_char) {
- push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
+ if ($has_wanted_chars) {
+ push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
}
}
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";