You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/14 22:51:13 UTC
[01/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 45d2d2e7d -> ba9221483
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
new file mode 100644
index 0000000..3004035
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
@@ -0,0 +1,232 @@
+#!/usr/bin/perl
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use warnings;
+use strict;
+use File::Spec;
+use Getopt::Long;
+use LWP::UserAgent;
+
+# Split $0 so the generated .java file can later be written into the same
+# directory this script lives in (see File::Spec->catpath below).
+my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
+
+# Required: the Unicode version (X.Y.Z) whose data files will be downloaded.
+my $version = '';
+unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+\.\d+/) {
+ print STDERR "Usage: $script_name -v <version>\n";
+ print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
+ if ($version);
+ exit 1;
+}
+# URLs of the Unicode Character Database files for the requested version.
+my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
+my $scripts_url = "${url_prefix}/Scripts.txt";
+my $line_break_url = "${url_prefix}/LineBreak.txt";
+my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
+my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
+# The generated class embeds the version with dots replaced by underscores,
+# e.g. WordBreakTestUnicode_6_3_0 / WordBreakTestUnicode_6_3_0.java.
+my $underscore_version = $version;
+$underscore_version =~ s/\./_/g;
+my $class_name = "WordBreakTestUnicode_${underscore_version}";
+my $output_filename = "${class_name}.java";
+# Boilerplate header of the generated JUnit test class.  The ${...} and $...
+# placeholders are interpolated from the variables computed above; '\\u' and
+# '\@' are escaped so they survive into the generated Java source.
+my $header =<<"__HEADER__";
+package org.apache.lucene.analysis;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.junit.Ignore;
+
+/**
+ * This class was automatically generated by ${script_name}
+ * from: ${url_prefix}/auxiliary/WordBreakTest.txt
+ *
+ * WordBreakTest.txt indicates the points in the provided character sequences
+ * at which conforming implementations must and must not break words. This
+ * class tests for expected token extraction from each of the test sequences
+ * in WordBreakTest.txt, where the expected tokens are those character
+ * sequences bounded by word breaks and containing at least one character
+ * from one of the following character sets:
+ *
+ * \\p{Script = Han} (From $scripts_url)
+ * \\p{Script = Hiragana}
+ * \\p{LineBreak = Complex_Context} (From $line_break_url)
+ * \\p{WordBreak = ALetter} (From $word_break_url)
+ * \\p{WordBreak = Hebrew_Letter}
+ * \\p{WordBreak = Katakana}
+ * \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
+ * [\\uFF10-\\uFF19] (Full-width Arabic digits)
+ */
+\@Ignore
+public class ${class_name} extends BaseTokenStreamTestCase {
+
+ public void test(Analyzer analyzer) throws Exception {
+__HEADER__
+
+# Sparse array indexed by code point; a defined slot marks a "wanted" char,
+# i.e. one that causes its containing word-break segment to become a token.
+my $codepoints = [];
+# Seed with the full-width Arabic digits [FF10-FF19].
+# (Postfix for: 'map' in void context for side effects is an anti-pattern.)
+$codepoints->[$_] = 1 for (0xFF10..0xFF19);
+# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
+# Using lowercase versions of property value names to allow for case-
+# insensitive comparison with the names in the Unicode data files.
+parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
+parse_Unicode_data_file($scripts_url, $codepoints,
+ {'han' => 1, 'hiragana' => 1});
+parse_Unicode_data_file($word_break_url, $codepoints,
+ {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
+# One entry per line of WordBreakTest.txt (handles both \n and \r\n endings).
+my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
+
+my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
+# Three-arg open with a lexical filehandle, and low-precedence 'or' so a
+# failed open actually dies: with '||' the operator bound to the filename
+# string (always true), so the die could never fire.
+open my $out, '>', $output_path
+  or die "Error opening '$output_path' for writing: $!";
+
+print STDERR "Writing '$output_path'...";
+
+print {$out} $header;
+
+# WordBreakTest.txt marks each position with U+00F7 DIVISION SIGN (÷, break
+# allowed) or U+00D7 MULTIPLICATION SIGN (×, break prohibited); these
+# characters had been corrupted to U+FFFD and are restored here.
+for my $line (@tests) {
+  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
+  # Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
+  my ($sequence) = $line =~ /^(.*?)\s*\#/;
+  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
+  print {$out} " // $line\n";
+  $sequence =~ s/\s*÷\s*$//; # Trim trailing break character
+  # Build the Java string literal: every break marker becomes a \u escape
+  # prefix for the following 4-digit code point.
+  my $test_string = $sequence;
+  $test_string =~ s/\s*÷\s*/\\u/g;
+  $test_string =~ s/\s*×\s*/\\u/g;
+  # 5+ hex digits = above the BMP: expand to a UTF-16 surrogate pair.
+  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
+  # Escapes that Java source cannot carry as raw \uXXXX: newline, carriage
+  # return (would end the line), and double quote (would end the literal).
+  $test_string =~ s/\\u000A/\\n/g;
+  $test_string =~ s/\\u000D/\\r/g;
+  $test_string =~ s/\\u0022/\\\"/g;
+  $sequence =~ s/^\s*÷\s*//; # Trim leading break character
+  # Each ÷-delimited segment is a candidate token; keep it only if it
+  # contains at least one "wanted" code point (see $codepoints above).
+  my @tokens = ();
+  for my $candidate (split /\s*÷\s*/, $sequence) {
+    my @chars = ();
+    my $has_wanted_char = 0;
+    while ($candidate =~ /([0-9A-F]+)/gi) {
+      my $hexchar = $1;
+      if (4 == length($hexchar)) {
+        push @chars, $hexchar;
+      } else {
+        push @chars, above_BMP_char_to_surrogates($hexchar);
+      }
+      unless ($has_wanted_char) {
+        $has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
+      }
+    }
+    if ($has_wanted_char) {
+      push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
+    }
+  }
+  print {$out} " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
+  print {$out} " new String[] { ";
+  print {$out} join(", ", @tokens), " });\n\n";
+}
+
+print {$out} " }\n}\n";
+# Check close on the write handle: buffered write errors surface here.
+close $out or die "Error closing '$output_path': $!";
+print STDERR "done.\n";
+
+
+# sub above_BMP_char_to_surrogates
+#
+# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
+# to the corresponding UTF-16 surrogate pair
+#
+# Assumption: input string is a sequence of more than four hex digits
+#
+sub above_BMP_char_to_surrogates {
+ my $ch = hex(shift);
+ # Lead (high) surrogate: top 10 bits of (ch - 0x10000), offset from U+D800.
+ my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10);
+ # Trail (low) surrogate: bottom 10 bits, offset from U+DC00.
+ my $low_surrogate = 0xDC00 + ($ch & 0x3FF);
+ # Return both halves formatted as 4-digit uppercase hex strings.
+ return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate);
+}
+
+
+# sub parse_Unicode_data_file
+#
+# Downloads the specified Unicode data file, parses it, and
+# extracts code points assigned any of the given property values, defining
+# the corresponding array position in the passed-in target array.
+#
+# Takes in the following parameters:
+#
+# - URL of the Unicode data file to download and parse
+# - Reference to target array
+# - Reference to hash of property values to get code points for
+#
+sub parse_Unicode_data_file {
+ my $url = shift;
+ my $target = shift;
+ my $wanted_property_values = shift;
+ my $content = get_URL_content($url);
+ print STDERR "Parsing '$url'...";
+ my @lines = split /\r?\n/, $content;
+ for (@lines) {
+ s/\s*#.*//; # Strip trailing comments
+ s/\s+$//; # Strip trailing space
+ next unless (/\S/); # Skip empty lines
+ my ($start, $end, $property_value);
+ # UCD lines are either a single code point or an inclusive range,
+ # followed by ';' and the property value.
+ if (/^([0-9A-F]{4,5})\s*;\s*(.+)/i) {
+ # 00AA ; LATIN
+ $start = $end = hex $1;
+ $property_value = lc $2; # Property value names are case-insensitive
+ } elsif (/^([0-9A-F]{4,5})..([0-9A-F]{4,5})\s*;\s*(.+)/i) {
+ # 0AE6..0AEF ; Gujarati
+ # NOTE(review): the '..' is unescaped, so it matches any two chars;
+ # benign because the UCD files are well-formed, but \.\. would be safer.
+ $start = hex $1;
+ $end = hex $2;
+ $property_value = lc $3; # Property value names are case-insensitive
+ } else {
+ next;
+ }
+ # Mark every code point in the range whose property value was requested.
+ if (defined($wanted_property_values->{$property_value})) {
+ for my $code_point ($start..$end) {
+ $target->[$code_point] = 1;
+ }
+ }
+ }
+ print STDERR "done.\n";
+}
+
+# sub get_URL_content
+#
+# Retrieves and returns the content of the given URL.
+#
+sub get_URL_content {
+ my $url = shift;
+ print STDERR "Retrieving '$url'...";
+ my $user_agent = LWP::UserAgent->new;
+ # HTTP::Request is loaded implicitly by LWP::UserAgent; no separate 'use'.
+ my $request = HTTP::Request->new(GET => $url);
+ my $response = $user_agent->request($request);
+ unless ($response->is_success) {
+ # Any download failure aborts the whole script, not just this call.
+ print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
+ exit 1;
+ }
+ print STDERR "done.\n";
+ return $response->content;
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/package.html
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/package.html b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/package.html
new file mode 100644
index 0000000..f7535b2
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/package.html
@@ -0,0 +1,26 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in spatial/ -->
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Classes to support <code>StandardAnalyzer</code> component testing
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
index 2b21103..3caaf54 100644
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
@@ -20,10 +20,10 @@ import java.util.Collection;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.core.SolrCore;
import org.carrot2.core.LanguageCode;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
index ae6739e..238d387 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
@@ -16,10 +16,6 @@
*/
package org.apache.solr.core;
-import javax.naming.Context;
-import javax.naming.InitialContext;
-import javax.naming.NamingException;
-import javax.naming.NoInitialContextException;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
@@ -48,13 +44,17 @@ import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.naming.Context;
+import javax.naming.InitialContext;
+import javax.naming.NamingException;
+import javax.naming.NoInitialContextException;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedStopFilterFactory.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedStopFilterFactory.java b/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedStopFilterFactory.java
index 853cf85..393f662 100644
--- a/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedStopFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedStopFilterFactory.java
@@ -18,9 +18,9 @@ package org.apache.solr.rest.schema.analysis;
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.rest.ManagedResource;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
index d3b0ab0..6e3d82c 100644
--- a/solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
@@ -278,11 +278,11 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
assertNotNull("Expecting the 'StandardFilter' to be applied on the query for the 'text' field", tokenList);
assertEquals("Query has only one token", 1, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1}, null, false));
- tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
+ tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.LowerCaseFilter");
assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the query for the 'text' field", tokenList);
assertEquals("Query has only one token", 1, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1}, null, false));
- tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.StopFilter");
+ tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.StopFilter");
assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList);
assertEquals("Query has only one token", 1, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1,1}, null, false));
@@ -311,7 +311,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4}, null, false));
assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6}, null, false));
- tokenList = valueResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
+ tokenList = valueResult.get("org.apache.lucene.analysis.LowerCaseFilter");
assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 6 tokens", 6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
@@ -320,7 +320,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4}, null, false));
assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5,5}, null, false));
assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6}, null, false));
- tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
+ tokenList = valueResult.get("org.apache.lucene.analysis.StopFilter");
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
index d2ef555..2ed00cc 100644
--- a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
@@ -209,7 +209,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(7), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8}, null, false));
assertToken(tokenList.get(8), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9}, null, true));
assertToken(tokenList.get(9), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10}, null, false));
- tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
+ tokenList = indexPart.get("org.apache.lucene.analysis.LowerCaseFilter");
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 10);
assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
@@ -222,7 +222,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(7), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8}, null, false));
assertToken(tokenList.get(8), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9}, null, true));
assertToken(tokenList.get(9), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10}, null, false));
- tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
+ tokenList = indexPart.get("org.apache.lucene.analysis.StopFilter");
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 8);
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
@@ -258,12 +258,12 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertEquals(2, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2}, null, false));
- tokenList = queryPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
+ tokenList = queryPart.get("org.apache.lucene.analysis.LowerCaseFilter");
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(2, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2}, null, false));
- tokenList = queryPart.get("org.apache.lucene.analysis.core.StopFilter");
+ tokenList = queryPart.get("org.apache.lucene.analysis.StopFilter");
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
assertEquals(2, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1,1}, null, false));
@@ -416,7 +416,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
- tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
+ tokenList = indexPart.get("org.apache.lucene.analysis.LowerCaseFilter");
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java b/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java
index 65f3242..fdf64ff 100644
--- a/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java
+++ b/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java
@@ -23,13 +23,13 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
[08/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
deleted file mode 100644
index 6c6ddc8..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ /dev/null
@@ -1,390 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Arrays;
-import java.util.Random;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockGraphTokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.TestUtil;
-
-public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
-
- // LUCENE-5897: slow tokenization of strings of the form (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
- @Slow
- public void testLargePartiallyMatchingToken() throws Exception {
- // TODO: get these lists of chars matching a property from ICU4J
- // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
- char[] WordBreak_ExtendNumLet_chars = "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
-
- // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
- int[] WordBreak_Format_chars // only the first char in ranges
- = { 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF,
- 0xFFF9, 0x110BD, 0x1D173, 0xE0001, 0xE0020 };
-
- // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
- int[] WordBreak_Extend_chars // only the first char in ranges
- = { 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df,
- 0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4,
- 0x900, 0x93a, 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2,
- 0xa01, 0xa3c, 0xa3e, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7,
- 0xacb, 0xae2, 0xb01, 0xb3c, 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6,
- 0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6,
- 0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46, 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf,
- 0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1, 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35,
- 0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102b, 0x1056, 0x105e, 0x1062,
- 0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4,
- 0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8, 0x1a17, 0x1a55, 0x1a60, 0x1a7f,
- 0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24, 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2,
- 0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674,
- 0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880, 0xa8b4, 0xa8e0, 0xa926, 0xa947,
- 0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1,
- 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20, 0xff9e, 0x101fd, 0x10a01,
- 0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038, 0x11080, 0x11082,
- 0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180, 0x11182,
- 0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
- 0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100 };
-
- StringBuilder builder = new StringBuilder();
- int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
- for (int i = 0 ; i < numChars ; ) {
- builder.append(WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
- ++i;
- if (random().nextBoolean()) {
- int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
- for (int j = 0; j < numFormatExtendChars; ++j) {
- int codepoint;
- if (random().nextBoolean()) {
- codepoint = WordBreak_Format_chars[random().nextInt(WordBreak_Format_chars.length)];
- } else {
- codepoint = WordBreak_Extend_chars[random().nextInt(WordBreak_Extend_chars.length)];
- }
- char[] chars = Character.toChars(codepoint);
- builder.append(chars);
- i += chars.length;
- }
- }
- }
- StandardTokenizer ts = new StandardTokenizer();
- ts.setReader(new StringReader(builder.toString()));
- ts.reset();
- while (ts.incrementToken()) { }
- ts.end();
- ts.close();
-
- int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
- ts.setMaxTokenLength(newBufferSize); // try a different buffer size
- ts.setReader(new StringReader(builder.toString()));
- ts.reset();
- while (ts.incrementToken()) { }
- ts.end();
- ts.close();
- }
-
- public void testHugeDoc() throws IOException {
- StringBuilder sb = new StringBuilder();
- char whitespace[] = new char[4094];
- Arrays.fill(whitespace, ' ');
- sb.append(whitespace);
- sb.append("testing 1234");
- String input = sb.toString();
- StandardTokenizer tokenizer = new StandardTokenizer();
- tokenizer.setReader(new StringReader(input));
- BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
- }
-
- private Analyzer a;
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
- return new TokenStreamComponents(tokenizer);
- }
- };
- }
-
- @Override
- public void tearDown() throws Exception {
- a.close();
- super.tearDown();
- }
-
- public void testArmenian() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b 13 \u0574\u056b\u056c\u056b\u0578\u0576 \u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568 (4,600` \u0570\u0561\u0575\u0565\u0580\u0565\u0576 \u057e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u0578\u0582\u0574) \u0563\u0580\u057e\u0565\u056c \u0565\u0576 \u056f\u0561\u0574\u0561\u057e\u0578\u0580\u0576\u0565\u0580\u056b \u056f\u0578\u0572\u0574\u056b\u0581 \u0578\u0582 \u0570\u0561\u0574\u0561\u0580\u0575\u0561 \u0562\u0578\u056c\u0578\u0580 \u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568 \u056f\u0561\u0580\u0578\u0572 \u0567 \u056d\u0574\u0562\u0561\u0563\u0580\u0565\u056c \u0581\u0561\u0576\u056f\u0561\u0581 \u0574\u0561\u0580\u0564 \u0578\u057e \u056f\u0561\u0580\u0578\u0572 \u0567 \u0562\u0561\u0581\u0565\u056c \u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b \u056f\u0561\u0575\u0584\u0568\u0589",
- new String[] { "\u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b", "13", "\u0574\u056b\u056c\u056b\u0578\u0576", "\u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568", "4,600", "\u0570\u0561\u0575\u0565\u0580\u0565\u0576", "\u057e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u0578\u0582\u0574", "\u0563\u0580\u057e\u0565\u056c", "\u0565\u0576", "\u056f\u0561\u0574\u0561\u057e\u0578\u0580\u0576\u0565\u0580\u056b", "\u056f\u0578\u0572\u0574\u056b\u0581",
- "\u0578\u0582", "\u0570\u0561\u0574\u0561\u0580\u0575\u0561", "\u0562\u0578\u056c\u0578\u0580", "\u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568", "\u056f\u0561\u0580\u0578\u0572", "\u0567", "\u056d\u0574\u0562\u0561\u0563\u0580\u0565\u056c", "\u0581\u0561\u0576\u056f\u0561\u0581", "\u0574\u0561\u0580\u0564", "\u0578\u057e", "\u056f\u0561\u0580\u0578\u0572", "\u0567", "\u0562\u0561\u0581\u0565\u056c", "\u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b", "\u056f\u0561\u0575\u0584\u0568" } );
- }
-
- public void testAmharic() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u12ca\u12aa\u1354\u12f5\u12eb \u12e8\u1263\u1208 \u1265\u12d9 \u124b\u1295\u124b \u12e8\u1270\u121f\u120b \u1275\u12ad\u12ad\u1208\u129b\u1293 \u1290\u133b \u1218\u12dd\u1308\u1260 \u12d5\u12cd\u1240\u1275 (\u12a2\u1295\u1233\u12ed\u12ad\u120e\u1352\u12f2\u12eb) \u1290\u12cd\u1362 \u121b\u1295\u129b\u12cd\u121d",
- new String[] { "\u12ca\u12aa\u1354\u12f5\u12eb", "\u12e8\u1263\u1208", "\u1265\u12d9", "\u124b\u1295\u124b", "\u12e8\u1270\u121f\u120b", "\u1275\u12ad\u12ad\u1208\u129b\u1293", "\u1290\u133b", "\u1218\u12dd\u1308\u1260", "\u12d5\u12cd\u1240\u1275", "\u12a2\u1295\u1233\u12ed\u12ad\u120e\u1352\u12f2\u12eb", "\u1290\u12cd", "\u121b\u1295\u129b\u12cd\u121d" } );
- }
-
- public void testArabic() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0627\u0644\u0641\u064a\u0644\u0645 \u0627\u0644\u0648\u062b\u0627\u0626\u0642\u064a \u0627\u0644\u0623\u0648\u0644 \u0639\u0646 \u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627 \u064a\u0633\u0645\u0649 \"\u0627\u0644\u062d\u0642\u064a\u0642\u0629 \u0628\u0627\u0644\u0623\u0631\u0642\u0627\u0645: \u0642\u0635\u0629 \u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627\" (\u0628\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629: Truth in Numbers: The Wikipedia Story)\u060c \u0633\u064a\u062a\u0645 \u0625\u0637\u0644\u0627\u0642\u0647 \u0641\u064a 2008.",
- new String[] { "\u0627\u0644\u0641\u064a\u0644\u0645", "\u0627\u0644\u0648\u062b\u0627\u0626\u0642\u064a", "\u0627\u0644\u0623\u0648\u0644", "\u0639\u0646", "\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627", "\u064a\u0633\u0645\u0649", "\u0627\u0644\u062d\u0642\u064a\u0642\u0629", "\u0628\u0627\u0644\u0623\u0631\u0642\u0627\u0645", "\u0642\u0635\u0629", "\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627",
- "\u0628\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "\u0633\u064a\u062a\u0645", "\u0625\u0637\u0644\u0627\u0642\u0647", "\u0641\u064a", "2008" } );
- }
-
- public void testAramaic() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0718\u071d\u0729\u071d\u0726\u0715\u071d\u0710 (\u0710\u0722\u0713\u0720\u071d\u0710: Wikipedia) \u0717\u0718 \u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710 \u071a\u0710\u072a\u072c\u0710 \u0715\u0710\u0722\u071b\u072a\u0722\u071b \u0712\u0720\u072b\u0722\u0308\u0710 \u0723\u0713\u071d\u0710\u0308\u0710\u0702 \u072b\u0721\u0717 \u0710\u072c\u0710 \u0721\u0722 \u0721\u0308\u0720\u072c\u0710 \u0715\"\u0718\u071d\u0729\u071d\" \u0718\"\u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710\"\u0700",
- new String[] { "\u0718\u071d\u0729\u071d\u0726\u0715\u071d\u0710", "\u0710\u0722\u0713\u0720\u071d\u0710", "Wikipedia", "\u0717\u0718", "\u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710", "\u071a\u0710\u072a\u072c\u0710", "\u0715\u0710\u0722\u071b\u072a\u0722\u071b", "\u0712\u0720\u072b\u0722\u0308\u0710", "\u0723\u0713\u071d\u0710\u0308\u0710", "\u072b\u0721\u0717",
- "\u0710\u072c\u0710", "\u0721\u0722", "\u0721\u0308\u0720\u072c\u0710", "\u0715", "\u0718\u071d\u0729\u071d", "\u0718", "\u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710"});
- }
-
- public void testBengali() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u098f\u0987 \u09ac\u09bf\u09b6\u09cd\u09ac\u0995\u09cb\u09b7 \u09aa\u09b0\u09bf\u099a\u09be\u09b2\u09a8\u09be \u0995\u09b0\u09c7 \u0989\u0987\u0995\u09bf\u09ae\u09bf\u09a1\u09bf\u09af\u09bc\u09be \u09ab\u09be\u0989\u09a8\u09cd\u09a1\u09c7\u09b6\u09a8 (\u098f\u0995\u099f\u09bf \u0985\u09b2\u09be\u09ad\u099c\u09a8\u0995 \u09b8\u0982\u09b8\u09cd\u09a5\u09be)\u0964 \u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be\u09b0 \u09b6\u09c1\u09b0\u09c1 \u09e7\u09eb \u099c\u09be\u09a8\u09c1\u09af\u09bc\u09be\u09b0\u09bf, \u09e8\u09e6\u09e6\u09e7 \u09b8\u09be\u09b2\u09c7\u0964 \u098f\u0996\u09a8 \u09aa\u09b0\u09cd\u09af\u09a8\u09cd\u09a4 \u09e8\u09e6\u09e6\u099f\u09bf\u09b0\u0993 \u09ac\u09c7\u09b6\u09c0 \u09ad\u09be\u09b7\u09be\u09af\u09bc \u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be \u09b0\u09af\u09bc\u09c7\u099b\u09c7\u0964",
- new String[] { "\u098f\u0987", "\u09ac\u09bf\u09b6\u09cd\u09ac\u0995\u09cb\u09b7", "\u09aa\u09b0\u09bf\u099a\u09be\u09b2\u09a8\u09be", "\u0995\u09b0\u09c7", "\u0989\u0987\u0995\u09bf\u09ae\u09bf\u09a1\u09bf\u09af\u09bc\u09be", "\u09ab\u09be\u0989\u09a8\u09cd\u09a1\u09c7\u09b6\u09a8", "\u098f\u0995\u099f\u09bf", "\u0985\u09b2\u09be\u09ad\u099c\u09a8\u0995", "\u09b8\u0982\u09b8\u09cd\u09a5\u09be", "\u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be\u09b0",
- "\u09b6\u09c1\u09b0\u09c1", "\u09e7\u09eb", "\u099c\u09be\u09a8\u09c1\u09af\u09bc\u09be\u09b0\u09bf", "\u09e8\u09e6\u09e6\u09e7", "\u09b8\u09be\u09b2\u09c7", "\u098f\u0996\u09a8", "\u09aa\u09b0\u09cd\u09af\u09a8\u09cd\u09a4", "\u09e8\u09e6\u09e6\u099f\u09bf\u09b0\u0993", "\u09ac\u09c7\u09b6\u09c0", "\u09ad\u09be\u09b7\u09be\u09af\u09bc", "\u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be", "\u09b0\u09af\u09bc\u09c7\u099b\u09c7" });
- }
-
- public void testFarsi() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0648\u06cc\u06a9\u06cc \u067e\u062f\u06cc\u0627\u06cc \u0627\u0646\u06af\u0644\u06cc\u0633\u06cc \u062f\u0631 \u062a\u0627\u0631\u06cc\u062e \u06f2\u06f5 \u062f\u06cc \u06f1\u06f3\u06f7\u06f9 \u0628\u0647 \u0635\u0648\u0631\u062a \u0645\u06a9\u0645\u0644\u06cc \u0628\u0631\u0627\u06cc \u062f\u0627\u0646\u0634\u0646\u0627\u0645\u0647\u0654 \u062a\u062e\u0635\u0635\u06cc \u0646\u0648\u067e\u062f\u06cc\u0627 \u0646\u0648\u0634\u062a\u0647 \u0634\u062f.",
- new String[] { "\u0648\u06cc\u06a9\u06cc", "\u067e\u062f\u06cc\u0627\u06cc", "\u0627\u0646\u06af\u0644\u06cc\u0633\u06cc", "\u062f\u0631", "\u062a\u0627\u0631\u06cc\u062e", "\u06f2\u06f5", "\u062f\u06cc", "\u06f1\u06f3\u06f7\u06f9", "\u0628\u0647", "\u0635\u0648\u0631\u062a", "\u0645\u06a9\u0645\u0644\u06cc",
- "\u0628\u0631\u0627\u06cc", "\u062f\u0627\u0646\u0634\u0646\u0627\u0645\u0647\u0654", "\u062a\u062e\u0635\u0635\u06cc", "\u0646\u0648\u067e\u062f\u06cc\u0627", "\u0646\u0648\u0634\u062a\u0647", "\u0634\u062f" });
- }
-
- public void testGreek() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0393\u03c1\u03ac\u03c6\u03b5\u03c4\u03b1\u03b9 \u03c3\u03b5 \u03c3\u03c5\u03bd\u03b5\u03c1\u03b3\u03b1\u03c3\u03af\u03b1 \u03b1\u03c0\u03cc \u03b5\u03b8\u03b5\u03bb\u03bf\u03bd\u03c4\u03ad\u03c2 \u03bc\u03b5 \u03c4\u03bf \u03bb\u03bf\u03b3\u03b9\u03c3\u03bc\u03b9\u03ba\u03cc wiki, \u03ba\u03ac\u03c4\u03b9 \u03c0\u03bf\u03c5 \u03c3\u03b7\u03bc\u03b1\u03af\u03bd\u03b5\u03b9 \u03cc\u03c4\u03b9 \u03ac\u03c1\u03b8\u03c1\u03b1 \u03bc\u03c0\u03bf\u03c1\u03b5\u03af \u03bd\u03b1 \u03c0\u03c1\u03bf\u03c3\u03c4\u03b5\u03b8\u03bf\u03cd\u03bd \u03ae \u03bd\u03b1 \u03b1\u03bb\u03bb\u03ac\u03be\u03bf\u03c5\u03bd \u03b1\u03c0\u03cc \u03c4\u03bf\u03bd \u03ba\u03b1\u03b8\u03ad\u03bd\u03b1.",
- new String[] { "\u0393\u03c1\u03ac\u03c6\u03b5\u03c4\u03b1\u03b9", "\u03c3\u03b5", "\u03c3\u03c5\u03bd\u03b5\u03c1\u03b3\u03b1\u03c3\u03af\u03b1", "\u03b1\u03c0\u03cc", "\u03b5\u03b8\u03b5\u03bb\u03bf\u03bd\u03c4\u03ad\u03c2", "\u03bc\u03b5", "\u03c4\u03bf", "\u03bb\u03bf\u03b3\u03b9\u03c3\u03bc\u03b9\u03ba\u03cc", "wiki", "\u03ba\u03ac\u03c4\u03b9", "\u03c0\u03bf\u03c5",
- "\u03c3\u03b7\u03bc\u03b1\u03af\u03bd\u03b5\u03b9", "\u03cc\u03c4\u03b9", "\u03ac\u03c1\u03b8\u03c1\u03b1", "\u03bc\u03c0\u03bf\u03c1\u03b5\u03af", "\u03bd\u03b1", "\u03c0\u03c1\u03bf\u03c3\u03c4\u03b5\u03b8\u03bf\u03cd\u03bd", "\u03ae", "\u03bd\u03b1", "\u03b1\u03bb\u03bb\u03ac\u03be\u03bf\u03c5\u03bd", "\u03b1\u03c0\u03cc", "\u03c4\u03bf\u03bd", "\u03ba\u03b1\u03b8\u03ad\u03bd\u03b1" });
- }
-
- public void testThai() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0e01\u0e32\u0e23\u0e17\u0e35\u0e48\u0e44\u0e14\u0e49\u0e15\u0e49\u0e2d\u0e07\u0e41\u0e2a\u0e14\u0e07\u0e27\u0e48\u0e32\u0e07\u0e32\u0e19\u0e14\u0e35. \u0e41\u0e25\u0e49\u0e27\u0e40\u0e18\u0e2d\u0e08\u0e30\u0e44\u0e1b\u0e44\u0e2b\u0e19? \u0e51\u0e52\u0e53\u0e54",
- new String[] { "\u0e01\u0e32\u0e23\u0e17\u0e35\u0e48\u0e44\u0e14\u0e49\u0e15\u0e49\u0e2d\u0e07\u0e41\u0e2a\u0e14\u0e07\u0e27\u0e48\u0e32\u0e07\u0e32\u0e19\u0e14\u0e35", "\u0e41\u0e25\u0e49\u0e27\u0e40\u0e18\u0e2d\u0e08\u0e30\u0e44\u0e1b\u0e44\u0e2b\u0e19", "\u0e51\u0e52\u0e53\u0e54" });
- }
-
- public void testLao() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0eaa\u0eb2\u0e97\u0eb2\u0ea5\u0eb0\u0e99\u0eb0\u0ea5\u0eb1\u0e94 \u0e9b\u0eb0\u0e8a\u0eb2\u0e97\u0eb4\u0e9b\u0eb0\u0ec4\u0e95 \u0e9b\u0eb0\u0e8a\u0eb2\u0e8a\u0ebb\u0e99\u0ea5\u0eb2\u0ea7",
- new String[] { "\u0eaa\u0eb2\u0e97\u0eb2\u0ea5\u0eb0\u0e99\u0eb0\u0ea5\u0eb1\u0e94", "\u0e9b\u0eb0\u0e8a\u0eb2\u0e97\u0eb4\u0e9b\u0eb0\u0ec4\u0e95", "\u0e9b\u0eb0\u0e8a\u0eb2\u0e8a\u0ebb\u0e99\u0ea5\u0eb2\u0ea7" });
- }
-
- public void testTibetan() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0f66\u0fa3\u0f7c\u0f53\u0f0b\u0f58\u0f5b\u0f7c\u0f51\u0f0b\u0f51\u0f44\u0f0b\u0f63\u0f66\u0f0b\u0f60\u0f51\u0f72\u0f66\u0f0b\u0f56\u0f7c\u0f51\u0f0b\u0f61\u0f72\u0f42\u0f0b\u0f58\u0f72\u0f0b\u0f49\u0f58\u0f66\u0f0b\u0f42\u0f7c\u0f44\u0f0b\u0f60\u0f55\u0f7a\u0f63\u0f0b\u0f51\u0f74\u0f0b\u0f42\u0f4f\u0f7c\u0f44\u0f0b\u0f56\u0f62\u0f0b\u0f67\u0f0b\u0f45\u0f44\u0f0b\u0f51\u0f42\u0f7a\u0f0b\u0f58\u0f5a\u0f53\u0f0b\u0f58\u0f46\u0f72\u0f66\u0f0b\u0f66\u0f7c\u0f0d \u0f0d",
- new String[] { "\u0f66\u0fa3\u0f7c\u0f53", "\u0f58\u0f5b\u0f7c\u0f51", "\u0f51\u0f44", "\u0f63\u0f66", "\u0f60\u0f51\u0f72\u0f66", "\u0f56\u0f7c\u0f51", "\u0f61\u0f72\u0f42",
- "\u0f58\u0f72", "\u0f49\u0f58\u0f66", "\u0f42\u0f7c\u0f44", "\u0f60\u0f55\u0f7a\u0f63", "\u0f51\u0f74", "\u0f42\u0f4f\u0f7c\u0f44", "\u0f56\u0f62",
- "\u0f67", "\u0f45\u0f44", "\u0f51\u0f42\u0f7a", "\u0f58\u0f5a\u0f53", "\u0f58\u0f46\u0f72\u0f66", "\u0f66\u0f7c" });
- }
-
- /*
- * For chinese, tokenize as char (these can later form bigrams or whatever)
- */
- public void testChinese() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u6211\u662f\u4e2d\u56fd\u4eba\u3002 \uff11\uff12\uff13\uff14 \uff34\uff45\uff53\uff54\uff53 ",
- new String[] { "\u6211", "\u662f", "\u4e2d", "\u56fd", "\u4eba", "\uff11\uff12\uff13\uff14", "\uff34\uff45\uff53\uff54\uff53"});
- }
-
- public void testEmpty() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
- }
-
- /* test various jira issues this analyzer is related to */
-
- public void testLUCENE1545() throws Exception {
- /*
- * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
- * The word "mo\u0364chte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
- * Expected result is only one token "mo\u0364chte".
- */
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "mo\u0364chte", new String[] { "mo\u0364chte" });
- }
-
- /* Tests from StandardAnalyzer, just to show behavior is similar */
- public void testAlphanumericSA() throws Exception {
- // alphanumeric tokens
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
- }
-
- public void testDelimitersSA() throws Exception {
- // other delimiters: "-", "/", ","
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
- }
-
- public void testApostrophesSA() throws Exception {
- // internal apostrophes: O'Reilly, you're, O'Reilly's
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
- }
-
- public void testNumericSA() throws Exception {
- // floating point, serial, model numbers, ip addresses, etc.
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
- }
-
- public void testTextWithNumbersSA() throws Exception {
- // numbers
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
- }
-
- public void testVariousTextSA() throws Exception {
- // various
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
- }
-
- public void testKoreanSA() throws Exception {
- // Korean words
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\uc548\ub155\ud558\uc138\uc694 \ud55c\uae00\uc785\ub2c8\ub2e4", new String[]{"\uc548\ub155\ud558\uc138\uc694", "\ud55c\uae00\uc785\ub2c8\ub2e4"});
- }
-
- public void testOffsets() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
- new String[] {"David", "has", "5000", "bones"},
- new int[] {0, 6, 10, 15},
- new int[] {5, 9, 14, 20});
- }
-
- public void testTypes() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
- new String[] {"David", "has", "5000", "bones"},
- new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
- }
-
- public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
- wordBreakTest.test(a);
- }
-
- public void testSupplementary() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\U00029b05\u8271\u935f\u41f9\u612f\u701b",
- new String[] {"\U00029b05", "\u8271", "\u935f", "\u41f9", "\u612f", "\u701b"},
- new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
- }
-
- public void testKorean() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\ud6c8\ubbfc\uc815\uc74c",
- new String[] { "\ud6c8\ubbfc\uc815\uc74c" },
- new String[] { "<HANGUL>" });
- }
-
- public void testJapanese() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u4eee\u540d\u9063\u3044 \u30ab\u30bf\u30ab\u30ca",
- new String[] { "\u4eee", "\u540d", "\u9063", "\u3044", "\u30ab\u30bf\u30ab\u30ca" },
- new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
- }
-
- public void testCombiningMarks() throws Exception {
- checkOneTerm(a, "\u3055\u3099", "\u3055\u3099"); // hiragana
- checkOneTerm(a, "\u30b5\u3099", "\u30b5\u3099"); // katakana
- checkOneTerm(a, "\u58f9\u3099", "\u58f9\u3099"); // ideographic
- checkOneTerm(a, "\uc544\u3099", "\uc544\u3099"); // hangul
- }
-
- /**
- * Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet},
- * and/or \p{MidNum} should trigger a token split.
- */
- public void testMid() throws Exception {
- // ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
-
- // '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
-
- // ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
-
- // Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
-
- // Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
-
- // '_' is in \p{WB:ExtendNumLet}
-
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A:B", new String[] { "A:B_A:B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A::B", new String[] { "A:B_A", "B" });
-
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1.2", new String[] { "1.2_1.2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A.B", new String[] { "A.B_A.B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1..2", new String[] { "1.2_1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A..B", new String[] { "A.B_A", "B" });
-
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,2", new String[] { "1,2_1,2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,,2", new String[] { "1,2_1", "2" });
-
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A.:B", new String[] { "C_A", "B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A:.B", new String[] { "C_A", "B" });
-
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1,.2", new String[] { "3_1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
- }
-
-
-
- /** blast some random strings through the analyzer */
- public void testRandomStrings() throws Exception {
- Analyzer analyzer = new StandardAnalyzer();
- checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
- analyzer.close();
- }
-
- /** blast some random large strings through the analyzer */
- public void testRandomHugeStrings() throws Exception {
- Analyzer analyzer = new StandardAnalyzer();
- checkRandomData(random(), analyzer, 100*RANDOM_MULTIPLIER, 8192);
- analyzer.close();
- }
-
- // Adds random graph after:
- public void testRandomHugeStringsGraphAfter() throws Exception {
- Random random = random();
- Analyzer analyzer = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
- TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
- return new TokenStreamComponents(tokenizer, tokenStream);
- }
- };
- checkRandomData(random, analyzer, 100*RANDOM_MULTIPLIER, 8192);
- analyzer.close();
- }
-}
[11/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java
index d14ad44..f6ba905 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java
@@ -20,10 +20,10 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Collection;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java
index 0301fa5..0397de7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java
@@ -17,16 +17,16 @@
package org.apache.lucene.analysis.miscellaneous;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
-
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
/**
* Factory for {@link CapitalizationFilter}.
* <p>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
index 40cd210..b086c62 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
@@ -17,8 +17,8 @@
package org.apache.lucene.analysis.miscellaneous;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java
index df82ff1..bde0e59 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java
@@ -21,9 +21,9 @@ import java.text.DateFormat;
import java.text.ParseException;
import java.util.Locale;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
/** Filters all tokens that cannot be parsed to a date, using the provided {@link DateFormat}. */
public class DateRecognizerFilter extends FilteringTokenFilter {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
index 4c8a5c7..7cbd6f8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -27,7 +28,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
index cb3e331..2255283 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
@@ -16,10 +16,10 @@
*/
package org.apache.lucene.analysis.miscellaneous;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* A TokenFilter that only keeps tokens with text contained in the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
index 7ff7834..8967c5b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
@@ -17,15 +17,15 @@
package org.apache.lucene.analysis.miscellaneous;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import java.util.Map;
-import java.io.IOException;
-
/**
* Factory for {@link KeepWordFilter}.
* <pre class="prettyprint">
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java
index 69c1aad..5b9f48d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java
@@ -21,8 +21,8 @@ import java.io.IOException;
import java.util.Map;
import java.util.regex.Pattern;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
index 0594c63..a18711c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
index a7ef58e..457087c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
@@ -20,7 +20,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import java.io.IOException;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SetKeywordMarkerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SetKeywordMarkerFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SetKeywordMarkerFilter.java
index c4dbf78..b0d079b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SetKeywordMarkerFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SetKeywordMarkerFilter.java
@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Marks terms as keywords via the {@link KeywordAttribute}. Each token
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
index 20e013d..f80ed8a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
@@ -16,22 +16,22 @@
*/
package org.apache.lucene.analysis.miscellaneous;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.InPlaceMergeSorter;
-import java.io.IOException;
-import java.util.Arrays;
-
/**
* Splits words into subwords and performs optional transformations on subword
* groups. Words are split into subwords with the following rules:
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
index 2f51a2b..6a15b55 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
@@ -16,13 +16,7 @@
*/
package org.apache.lucene.analysis.miscellaneous;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -30,7 +24,13 @@ import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index da104c9..87465b7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
+import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index e8b152d..0391425 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -17,27 +17,27 @@
package org.apache.lucene.analysis.nl;
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
-import java.io.IOException;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-
/**
* {@link Analyzer} for Dutch language.
* <p>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
index 4110da3..c413793 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.tartarus.snowball.ext.NorwegianStemmer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
index ecdb944..769e142 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java
index f24cf2a..9fdb73e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java
@@ -30,7 +30,7 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
index 61475d2..37f044a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
@@ -21,8 +21,8 @@ import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
index 7436243..06ff999 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
@@ -21,16 +21,16 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.RomanianStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index db2df8a..dfe8ef3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
index 1c11e48..06aed49 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
@@ -19,11 +19,11 @@ package org.apache.lucene.analysis.snowball;
import java.io.IOException;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
import org.tartarus.snowball.SnowballProgram;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java
index 93cf7a4..d598a09 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java
@@ -17,13 +17,13 @@
package org.apache.lucene.analysis.snowball;
-import java.util.Map;
import java.io.IOException;
+import java.util.Map;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
index 43c7dad..dc6c118 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
@@ -20,13 +20,13 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
/**
* Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
deleted file mode 100644
index ae23dc6..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
-
-/**
- * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
- * LowerCaseFilter} and {@link StopFilter}, using a list of
- * English stop words.
- */
-public final class StandardAnalyzer extends StopwordAnalyzerBase {
-
- /** Default maximum allowed token length */
- public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
-
- private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
-
- /** An unmodifiable set containing some common English words that are usually not
- useful for searching. */
- public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-
- /** Builds an analyzer with the given stop words.
- * @param stopWords stop words */
- public StandardAnalyzer(CharArraySet stopWords) {
- super(stopWords);
- }
-
- /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
- */
- public StandardAnalyzer() {
- this(STOP_WORDS_SET);
- }
-
- /** Builds an analyzer with the stop words from the given reader.
- * @see WordlistLoader#getWordSet(Reader)
- * @param stopwords Reader to read stop words from */
- public StandardAnalyzer(Reader stopwords) throws IOException {
- this(loadStopwordSet(stopwords));
- }
-
- /**
- * Set maximum allowed token length. If a token is seen
- * that exceeds this length then it is discarded. This
- * setting only takes effect the next time tokenStream or
- * tokenStream is called.
- */
- public void setMaxTokenLength(int length) {
- maxTokenLength = length;
- }
-
- /**
- * @see #setMaxTokenLength
- */
- public int getMaxTokenLength() {
- return maxTokenLength;
- }
-
- @Override
- protected TokenStreamComponents createComponents(final String fieldName) {
- final StandardTokenizer src = new StandardTokenizer();
- src.setMaxTokenLength(maxTokenLength);
- TokenStream tok = new StandardFilter(src);
- tok = new LowerCaseFilter(tok);
- tok = new StopFilter(tok, stopwords);
- return new TokenStreamComponents(src, tok) {
- @Override
- protected void setReader(final Reader reader) {
- src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
- super.setReader(reader);
- }
- };
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
deleted file mode 100644
index a470a83..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-
-/**
- * Normalizes tokens extracted with {@link StandardTokenizer}.
- */
-public class StandardFilter extends TokenFilter {
-
- public StandardFilter(TokenStream in) {
- super(in);
- }
-
- @Override
- public final boolean incrementToken() throws IOException {
- return input.incrementToken(); // TODO: add some niceties for the new grammar
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
deleted file mode 100644
index 1e143a3..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeFactory;
-
-/** A grammar-based tokenizer constructed with JFlex.
- * <p>
- * This class implements the Word Break rules from the
- * Unicode Text Segmentation algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * <p>Many applications have specific tokenizer needs. If this tokenizer does
- * not suit your application, please consider copying this source code
- * directory to your project and maintaining your own grammar-based tokenizer.
- */
-
-public final class StandardTokenizer extends Tokenizer {
- /** A private instance of the JFlex-constructed scanner */
- private StandardTokenizerImpl scanner;
-
- // TODO: how can we remove these old types?!
- public static final int ALPHANUM = 0;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int APOSTROPHE = 1;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int ACRONYM = 2;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int COMPANY = 3;
- public static final int EMAIL = 4;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int HOST = 5;
- public static final int NUM = 6;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int CJ = 7;
-
- /** @deprecated (3.1) */
- @Deprecated
- public static final int ACRONYM_DEP = 8;
-
- public static final int SOUTHEAST_ASIAN = 9;
- public static final int IDEOGRAPHIC = 10;
- public static final int HIRAGANA = 11;
- public static final int KATAKANA = 12;
- public static final int HANGUL = 13;
-
- /** String token types that correspond to token type int constants */
- public static final String [] TOKEN_TYPES = new String [] {
- "<ALPHANUM>",
- "<APOSTROPHE>",
- "<ACRONYM>",
- "<COMPANY>",
- "<EMAIL>",
- "<HOST>",
- "<NUM>",
- "<CJ>",
- "<ACRONYM_DEP>",
- "<SOUTHEAST_ASIAN>",
- "<IDEOGRAPHIC>",
- "<HIRAGANA>",
- "<KATAKANA>",
- "<HANGUL>"
- };
-
- public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
-
- private int skippedPositions;
-
- private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-
- /**
- * Set the max allowed token length. No tokens longer than this are emitted.
- *
- * @throws IllegalArgumentException if the given length is outside of the
- * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
- */
- public void setMaxTokenLength(int length) {
- if (length < 1) {
- throw new IllegalArgumentException("maxTokenLength must be greater than zero");
- } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
- throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
- }
- if (length != maxTokenLength) {
- maxTokenLength = length;
- scanner.setBufferSize(length);
- }
- }
-
- /** @see #setMaxTokenLength */
- public int getMaxTokenLength() {
- return maxTokenLength;
- }
-
- /**
- * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
- * the <code>input</code> to the newly created JFlex scanner.
-
- * See http://issues.apache.org/jira/browse/LUCENE-1068
- */
- public StandardTokenizer() {
- init();
- }
-
- /**
- * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
- */
- public StandardTokenizer(AttributeFactory factory) {
- super(factory);
- init();
- }
-
- private void init() {
- this.scanner = new StandardTokenizerImpl(input);
- }
-
- // this tokenizer generates three attributes:
- // term offset, positionIncrement and type
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
- @Override
- public final boolean incrementToken() throws IOException {
- clearAttributes();
- skippedPositions = 0;
-
- while(true) {
- int tokenType = scanner.getNextToken();
-
- if (tokenType == StandardTokenizerImpl.YYEOF) {
- return false;
- }
-
- if (scanner.yylength() <= maxTokenLength) {
- posIncrAtt.setPositionIncrement(skippedPositions+1);
- scanner.getText(termAtt);
- final int start = scanner.yychar();
- offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
- typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
- return true;
- } else
- // When we skip a too-long term, we still increment the
- // position increment
- skippedPositions++;
- }
- }
-
- @Override
- public final void end() throws IOException {
- super.end();
- // set final offset
- int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
- offsetAtt.setOffset(finalOffset, finalOffset);
- // adjust any skipped tokens
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
- }
-
- @Override
- public void close() throws IOException {
- super.close();
- scanner.yyreset(input);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- scanner.yyreset(input);
- skippedPositions = 0;
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
deleted file mode 100644
index c8bf9e9..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ /dev/null
@@ -1,818 +0,0 @@
-/* The following code was generated by JFlex 1.6.0 */
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-/**
- * This class implements Word Break rules from the Unicode Text Segmentation
- * algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * <p>
- * Tokens produced are of the following types:
- * <ul>
- * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
- * <li><NUM>: A number</li>
- * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
- * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
- * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
- * <li><HIRAGANA>: A single hiragana character</li>
- * <li><KATAKANA>: A sequence of katakana characters</li>
- * <li><HANGUL>: A sequence of Hangul characters</li>
- * </ul>
- */
-@SuppressWarnings("fallthrough")
-
-public final class StandardTokenizerImpl {
-
- /** This character denotes the end of file */
- public static final int YYEOF = -1;
-
- /** initial size of the lookahead buffer */
- private int ZZ_BUFFERSIZE = 255;
-
- /** lexical states */
- public static final int YYINITIAL = 0;
-
- /**
- * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
- * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
- * at the beginning of a line
- * l is of the form l = 2*k, k a non negative integer
- */
- private static final int ZZ_LEXSTATE[] = {
- 0, 0
- };
-
- /**
- * Translates characters to character classes
- */
- private static final String ZZ_CMAP_PACKED =
- "\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
- "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
- "\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
- "\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
- "\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
- "\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
- "\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
- "\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
- "\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
- "\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
- "\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
- "\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
- "\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
- "\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
- "\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
- "\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
- "\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
- "\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
- "\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
- "\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
- "\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
- "\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
- "\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
- "\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
- "\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
- "\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
- "\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
- "\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
- "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
- "\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
- "\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
- "\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
- "\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
- "\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
- "\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
- "\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
- "\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
- "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
- "\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
- "\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
- "\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
- "\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
- "\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
- "\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
- "\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
- "\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
- "\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
- "\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
- "\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
- "\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
- "\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
- "\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
- "\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
- "\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
- "\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
- "\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
- "\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
- "\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
- "\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
- "\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
- "\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
- "\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
- "\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
- "\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
- "\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
- "\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
- "\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
- "\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
- "\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
- "\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
- "\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
- "\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
- "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
- "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
- "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
- "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
- "\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
- "\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
- "\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
- "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
- "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
- "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
- "\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
- "\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
- "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
- "\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
- "\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
- "\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
- "\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
- "\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
- "\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
- "\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
- "\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
- "\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
- "\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
- "\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
- "\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
- "\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
- "\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
- "\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
- "\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
- "\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
- "\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
- "\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
- "\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
- "\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
- "\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
- "\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
- "\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
- "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
- "\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
- "\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
- "\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
- "\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
- "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
- "\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
- "\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
- "\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
- "\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
- "\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
- "\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
- "\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
- "\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
- "\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
- "\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
- "\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
- "\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
- "\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
- "\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
- "\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
- "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
- "\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
- "\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
- "\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
- "\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
- "\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
- "\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
- "\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
- "\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
- "\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
- "\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
-
- /**
- * Translates characters to character classes
- */
- private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
-
- /**
- * Translates DFA states to action switch labels.
- */
- private static final int [] ZZ_ACTION = zzUnpackAction();
-
- private static final String ZZ_ACTION_PACKED_0 =
- "\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
- "\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
- "\1\4\1\0\2\2\2\0\1\1\1\0";
-
- private static int [] zzUnpackAction() {
- int [] result = new int[24];
- int offset = 0;
- offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
- return result;
- }
-
- private static int zzUnpackAction(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
- int l = packed.length();
- while (i < l) {
- int count = packed.charAt(i++);
- int value = packed.charAt(i++);
- do result[j++] = value; while (--count > 0);
- }
- return j;
- }
-
-
- /**
- * Translates a state to a row index in the transition table
- */
- private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
-
- private static final String ZZ_ROWMAP_PACKED_0 =
- "\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
- "\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
- "\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
-
- private static int [] zzUnpackRowMap() {
- int [] result = new int[24];
- int offset = 0;
- offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
- return result;
- }
-
- private static int zzUnpackRowMap(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
- int l = packed.length();
- while (i < l) {
- int high = packed.charAt(i++) << 16;
- result[j++] = high | packed.charAt(i++);
- }
- return j;
- }
-
- /**
- * The transition table of the DFA
- */
- private static final int [] ZZ_TRANS = zzUnpackTrans();
-
- private static final String ZZ_TRANS_PACKED_0 =
- "\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
- "\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
- "\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
- "\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
- "\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
- "\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
- "\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
- "\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
- "\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
- "\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
- "\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
- "\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
- "\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
- "\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
- "\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
- "\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
- "\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
- "\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
- "\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
- "\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
- "\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
- "\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
- "\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
- "\1\30\1\15\14\0\1\30";
-
- private static int [] zzUnpackTrans() {
- int [] result = new int[396];
- int offset = 0;
- offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
- return result;
- }
-
- private static int zzUnpackTrans(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
- int l = packed.length();
- while (i < l) {
- int count = packed.charAt(i++);
- int value = packed.charAt(i++);
- value--;
- do result[j++] = value; while (--count > 0);
- }
- return j;
- }
-
-
- /* error codes */
- private static final int ZZ_UNKNOWN_ERROR = 0;
- private static final int ZZ_NO_MATCH = 1;
- private static final int ZZ_PUSHBACK_2BIG = 2;
-
- /* error messages for the codes above */
- private static final String ZZ_ERROR_MSG[] = {
- "Unkown internal scanner error",
- "Error: could not match input",
- "Error: pushback value was too large"
- };
-
- /**
- * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
- */
- private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
-
- private static final String ZZ_ATTRIBUTE_PACKED_0 =
- "\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
- "\2\1\2\0\1\1\1\0";
-
- private static int [] zzUnpackAttribute() {
- int [] result = new int[24];
- int offset = 0;
- offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
- return result;
- }
-
- private static int zzUnpackAttribute(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
- int l = packed.length();
- while (i < l) {
- int count = packed.charAt(i++);
- int value = packed.charAt(i++);
- do result[j++] = value; while (--count > 0);
- }
- return j;
- }
-
- /** the input device */
- private java.io.Reader zzReader;
-
- /** the current state of the DFA */
- private int zzState;
-
- /** the current lexical state */
- private int zzLexicalState = YYINITIAL;
-
- /** this buffer contains the current text to be matched and is
- the source of the yytext() string */
- private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
-
- /** the textposition at the last accepting state */
- private int zzMarkedPos;
-
- /** the current text position in the buffer */
- private int zzCurrentPos;
-
- /** startRead marks the beginning of the yytext() string in the buffer */
- private int zzStartRead;
-
- /** endRead marks the last character in the buffer, that has been read
- from input */
- private int zzEndRead;
-
- /** number of newlines encountered up to the start of the matched text */
- private int yyline;
-
- /** the number of characters up to the start of the matched text */
- private int yychar;
-
- /**
- * the number of characters from the last newline up to the start of the
- * matched text
- */
- private int yycolumn;
-
- /**
- * zzAtBOL == true <=> the scanner is currently at the beginning of a line
- */
- private boolean zzAtBOL = true;
-
- /** zzAtEOF == true <=> the scanner is at the EOF */
- private boolean zzAtEOF;
-
- /** denotes if the user-EOF-code has already been executed */
- private boolean zzEOFDone;
-
- /**
- * The number of occupied positions in zzBuffer beyond zzEndRead.
- * When a lead/high surrogate has been read from the input stream
- * into the final zzBuffer position, this will have a value of 1;
- * otherwise, it will have a value of 0.
- */
- private int zzFinalHighSurrogate = 0;
-
- /* user code: */
- /** Alphanumeric sequences */
- public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
-
- /** Numbers */
- public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
-
- /**
- * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
- * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
- * together as a single token rather than broken up, because the logic
- * required to break them at word boundaries is too complex for UAX#29.
- * <p>
- * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
- */
- public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
-
- public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
-
- public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
-
- public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
-
- public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
-
- public final int yychar()
- {
- return yychar;
- }
-
- /**
- * Fills CharTermAttribute with the current token text.
- */
- public final void getText(CharTermAttribute t) {
- t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
- }
-
- /**
- * Sets the scanner buffer size in chars
- */
- public final void setBufferSize(int numChars) {
- ZZ_BUFFERSIZE = numChars;
- char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
- System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
- zzBuffer = newZzBuffer;
- }
-
-
- /**
- * Creates a new scanner
- *
- * @param in the java.io.Reader to read input from.
- */
- public StandardTokenizerImpl(java.io.Reader in) {
- this.zzReader = in;
- }
-
-
- /**
- * Unpacks the compressed character translation table.
- *
- * @param packed the packed character translation table
- * @return the unpacked character translation table
- */
- private static char [] zzUnpackCMap(String packed) {
- char [] map = new char[0x110000];
- int i = 0; /* index in packed string */
- int j = 0; /* index in unpacked array */
- while (i < 2836) {
- int count = packed.charAt(i++);
- char value = packed.charAt(i++);
- do map[j++] = value; while (--count > 0);
- }
- return map;
- }
-
-
- /**
- * Refills the input buffer.
- *
- * @return <code>false</code>, iff there was new input.
- *
- * @exception java.io.IOException if any I/O-Error occurs
- */
- private boolean zzRefill() throws java.io.IOException {
-
- /* first: make room (if you can) */
- if (zzStartRead > 0) {
- zzEndRead += zzFinalHighSurrogate;
- zzFinalHighSurrogate = 0;
- System.arraycopy(zzBuffer, zzStartRead,
- zzBuffer, 0,
- zzEndRead-zzStartRead);
-
- /* translate stored positions */
- zzEndRead-= zzStartRead;
- zzCurrentPos-= zzStartRead;
- zzMarkedPos-= zzStartRead;
- zzStartRead = 0;
- }
-
-
- /* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
- int totalRead = 0;
- while (totalRead < requested) {
- int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
- if (numRead == -1) {
- break;
- }
- totalRead += numRead;
- }
-
- if (totalRead > 0) {
- zzEndRead += totalRead;
- if (totalRead == requested) { /* possibly more input available */
- if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
- --zzEndRead;
- zzFinalHighSurrogate = 1;
- if (totalRead == 1) { return true; }
- }
- }
- return false;
- }
-
- // totalRead = 0: End of stream
- return true;
- }
-
-
- /**
- * Closes the input stream.
- */
- public final void yyclose() throws java.io.IOException {
- zzAtEOF = true; /* indicate end of file */
- zzEndRead = zzStartRead; /* invalidate buffer */
-
- if (zzReader != null)
- zzReader.close();
- }
-
-
- /**
- * Resets the scanner to read from a new input stream.
- * Does not close the old reader.
- *
- * All internal variables are reset, the old input stream
- * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
- *
- * Internal scan buffer is resized down to its initial length, if it has grown.
- *
- * @param reader the new input stream
- */
- public final void yyreset(java.io.Reader reader) {
- zzReader = reader;
- zzAtBOL = true;
- zzAtEOF = false;
- zzEOFDone = false;
- zzEndRead = zzStartRead = 0;
- zzCurrentPos = zzMarkedPos = 0;
- zzFinalHighSurrogate = 0;
- yyline = yychar = yycolumn = 0;
- zzLexicalState = YYINITIAL;
- if (zzBuffer.length > ZZ_BUFFERSIZE)
- zzBuffer = new char[ZZ_BUFFERSIZE];
- }
-
-
- /**
- * Returns the current lexical state.
- */
- public final int yystate() {
- return zzLexicalState;
- }
-
-
- /**
- * Enters a new lexical state
- *
- * @param newState the new lexical state
- */
- public final void yybegin(int newState) {
- zzLexicalState = newState;
- }
-
-
- /**
- * Returns the text matched by the current regular expression.
- */
- public final String yytext() {
- return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
- }
-
-
- /**
- * Returns the character at position <tt>pos</tt> from the
- * matched text.
- *
- * It is equivalent to yytext().charAt(pos), but faster
- *
- * @param pos the position of the character to fetch.
- * A value from 0 to yylength()-1.
- *
- * @return the character at position pos
- */
- public final char yycharat(int pos) {
- return zzBuffer[zzStartRead+pos];
- }
-
-
- /**
- * Returns the length of the matched text region.
- */
- public final int yylength() {
- return zzMarkedPos-zzStartRead;
- }
-
-
- /**
- * Reports an error that occured while scanning.
- *
- * In a wellformed scanner (no or only correct usage of
- * yypushback(int) and a match-all fallback rule) this method
- * will only be called with things that "Can't Possibly Happen".
- * If this method is called, something is seriously wrong
- * (e.g. a JFlex bug producing a faulty scanner etc.).
- *
- * Usual syntax/scanner level error handling should be done
- * in error fallback rules.
- *
- * @param errorCode the code of the errormessage to display
- */
- private void zzScanError(int errorCode) {
- String message;
- try {
- message = ZZ_ERROR_MSG[errorCode];
- }
- catch (ArrayIndexOutOfBoundsException e) {
- message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
- }
-
- throw new Error(message);
- }
-
-
- /**
- * Pushes the specified amount of characters back into the input stream.
- *
- * They will be read again by then next call of the scanning method
- *
- * @param number the number of characters to be read again.
- * This number must not be greater than yylength()!
- */
- public void yypushback(int number) {
- if ( number > yylength() )
- zzScanError(ZZ_PUSHBACK_2BIG);
-
- zzMarkedPos -= number;
- }
-
-
- /**
- * Resumes scanning until the next regular expression is matched,
- * the end of input is encountered or an I/O-Error occurs.
- *
- * @return the next token
- * @exception java.io.IOException if any I/O-Error occurs
- */
- public int getNextToken() throws java.io.IOException {
- int zzInput;
- int zzAction;
-
- // cached fields:
- int zzCurrentPosL;
- int zzMarkedPosL;
- int zzEndReadL = zzEndRead;
- char [] zzBufferL = zzBuffer;
- char [] zzCMapL = ZZ_CMAP;
-
- int [] zzTransL = ZZ_TRANS;
- int [] zzRowMapL = ZZ_ROWMAP;
- int [] zzAttrL = ZZ_ATTRIBUTE;
-
- while (true) {
- zzMarkedPosL = zzMarkedPos;
-
- yychar+= zzMarkedPosL-zzStartRead;
-
- zzAction = -1;
-
- zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
-
- zzState = ZZ_LEXSTATE[zzLexicalState];
-
- // set up zzAction for empty match case:
- int zzAttributes = zzAttrL[zzState];
- if ( (zzAttributes & 1) == 1 ) {
- zzAction = zzState;
- }
-
-
- zzForAction: {
- while (true) {
-
- if (zzCurrentPosL < zzEndReadL) {
- zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
- zzCurrentPosL += Character.charCount(zzInput);
- }
- else if (zzAtEOF) {
- zzInput = YYEOF;
- break zzForAction;
- }
- else {
- // store back cached positions
- zzCurrentPos = zzCurrentPosL;
- zzMarkedPos = zzMarkedPosL;
- boolean eof = zzRefill();
- // get translated positions and possibly new buffer
- zzCurrentPosL = zzCurrentPos;
- zzMarkedPosL = zzMarkedPos;
- zzBufferL = zzBuffer;
- zzEndReadL = zzEndRead;
- if (eof) {
- zzInput = YYEOF;
- break zzForAction;
- }
- else {
- zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
- zzCurrentPosL += Character.charCount(zzInput);
- }
- }
- int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
- if (zzNext == -1) break zzForAction;
- zzState = zzNext;
-
- zzAttributes = zzAttrL[zzState];
- if ( (zzAttributes & 1) == 1 ) {
- zzAction = zzState;
- zzMarkedPosL = zzCurrentPosL;
- if ( (zzAttributes & 8) == 8 ) break zzForAction;
- }
-
- }
- }
-
- // store back cached position
- zzMarkedPos = zzMarkedPosL;
-
- switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 1:
- { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
- }
- case 9: break;
- case 2:
- { return WORD_TYPE;
- }
- case 10: break;
- case 3:
- { return HANGUL_TYPE;
- }
- case 11: break;
- case 4:
- { return NUMERIC_TYPE;
- }
- case 12: break;
- case 5:
- { return KATAKANA_TYPE;
- }
- case 13: break;
- case 6:
- { return IDEOGRAPHIC_TYPE;
- }
- case 14: break;
- case 7:
- { return HIRAGANA_TYPE;
- }
- case 15: break;
- case 8:
- { return SOUTH_EAST_ASIAN_TYPE;
- }
- case 16: break;
- default:
- if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
- zzAtEOF = true;
- {
- return YYEOF;
- }
- }
- else {
- zzScanError(ZZ_NO_MATCH);
- }
- }
- }
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
deleted file mode 100644
index 34f4ead..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-/**
- * This class implements Word Break rules from the Unicode Text Segmentation
- * algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * <p>
- * Tokens produced are of the following types:
- * <ul>
- * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
- * <li><NUM>: A number</li>
- * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
- * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
- * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
- * <li><HIRAGANA>: A single hiragana character</li>
- * <li><KATAKANA>: A sequence of katakana characters</li>
- * <li><HANGUL>: A sequence of Hangul characters</li>
- * </ul>
- */
-@SuppressWarnings("fallthrough")
-%%
-
-%unicode 6.3
-%integer
-%final
-%public
-%class StandardTokenizerImpl
-%function getNextToken
-%char
-%buffer 255
-
-// UAX#29 WB4. X (Extend | Format)* --> X
-//
-HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
-HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
-NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
-MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
-MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
-HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
-HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
-SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
-DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
-HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
-RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
-ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
-
-%{
- /** Alphanumeric sequences */
- public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
-
- /** Numbers */
- public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
-
- /**
- * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
- * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
- * together as a single token rather than broken up, because the logic
- * required to break them at word boundaries is too complex for UAX#29.
- * <p>
- * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
- */
- public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
-
- public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
-
- public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
-
- public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
-
- public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
-
- public final int yychar()
- {
- return yychar;
- }
-
- /**
- * Fills CharTermAttribute with the current token text.
- */
- public final void getText(CharTermAttribute t) {
- t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
- }
-
- /**
- * Sets the scanner buffer size in chars
- */
- public final void setBufferSize(int numChars) {
- ZZ_BUFFERSIZE = numChars;
- char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
- System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
- zzBuffer = newZzBuffer;
- }
-%}
-
-%%
-
-// UAX#29 WB1. sot ÷
-// WB2. ÷ eot
-//
-<<EOF>> { return YYEOF; }
-
-// UAX#29 WB8. Numeric × Numeric
-// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
-// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
-// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
-//
-{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
- { return NUMERIC_TYPE; }
-
-// subset of the below for typing purposes only!
-{HangulEx}+
- { return HANGUL_TYPE; }
-
-{KatakanaEx}+
- { return KATAKANA_TYPE; }
-
-// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
-// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
-// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
-// WB7a. Hebrew_Letter × Single_Quote
-// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
-// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
-// WB9. (ALetter | Hebrew_Letter) × Numeric
-// WB10. Numeric × (ALetter | Hebrew_Letter)
-// WB13. Katakana × Katakana
-// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
-//
-{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
- )+
- )
-({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
- )+
- )
-)*
-{ExtendNumLetEx}*
- { return WORD_TYPE; }
-
-
-// From UAX #29:
-//
-// [C]haracters with the Line_Break property values of Contingent_Break (CB),
-// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
-// boundary property values based on criteria outside of the scope of this
-// annex. That means that satisfactory treatment of languages like Chinese
-// or Thai requires special handling.
-//
-// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
-// property: U+FFFC ( \ufffc ) OBJECT REPLACEMENT CHARACTER.
-//
-// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
-// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
-// Lao, etc.) are kept together. This grammar does the same below.
-//
-// See also the Unicode Line Breaking Algorithm:
-//
-// http://www.unicode.org/reports/tr14/#SA
-//
-{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
-
-// UAX#29 WB14. Any ÷ Any
-//
-{HanEx} { return IDEOGRAPHIC_TYPE; }
-{HiraganaEx} { return HIRAGANA_TYPE; }
-
-
-// UAX#29 WB3. CR × LF
-// WB3a. (Newline | CR | LF) ÷
-// WB3b. ÷ (Newline | CR | LF)
-// WB13c. Regional_Indicator × Regional_Indicator
-// WB14. Any ÷ Any
-//
-{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
- { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
index 1fc2d7c..9994884 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
@@ -20,18 +20,18 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* Filters {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer}
* with {@link org.apache.lucene.analysis.standard.StandardFilter},
- * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and
- * {@link org.apache.lucene.analysis.core.StopFilter}, using a list of
+ * {@link org.apache.lucene.analysis.LowerCaseFilter} and
+ * {@link org.apache.lucene.analysis.StopFilter}, using a list of
* English stop words.
*/
public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
@@ -59,7 +59,7 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
}
/** Builds an analyzer with the stop words from the given reader.
- * @see org.apache.lucene.analysis.util.WordlistLoader#getWordSet(java.io.Reader)
+ * @see org.apache.lucene.analysis.WordlistLoader#getWordSet(java.io.Reader)
* @param stopwords Reader to read stop words from */
public UAX29URLEmailAnalyzer(Reader stopwords) throws IOException {
this(loadStopwordSet(stopwords));
[02/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java
new file mode 100644
index 0000000..4a3731e
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java
@@ -0,0 +1,5537 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.junit.Ignore;
+
+/**
+ * This class was automatically generated by generateJavaUnicodeWordBreakTest.pl
+ * from: http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakTest.txt
+ *
+ * WordBreakTest.txt indicates the points in the provided character sequences
+ * at which conforming implementations must and must not break words. This
+ * class tests for expected token extraction from each of the test sequences
+ * in WordBreakTest.txt, where the expected tokens are those character
+ * sequences bounded by word breaks and containing at least one character
+ * from one of the following character sets:
+ *
+ * \p{Script = Han} (From http://www.unicode.org/Public/6.3.0/ucd/Scripts.txt)
+ * \p{Script = Hiragana}
+ * \p{LineBreak = Complex_Context} (From http://www.unicode.org/Public/6.3.0/ucd/LineBreak.txt)
+ * \p{WordBreak = ALetter} (From http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt)
+ * \p{WordBreak = Hebrew_Letter}
+ * \p{WordBreak = Katakana}
+ * \p{WordBreak = Numeric} (Excludes full-width Arabic digits)
+ * [\uFF10-\uFF19] (Full-width Arabic digits)
+ */
+@Ignore
+public class WordBreakTestUnicode_6_3_0 extends BaseTokenStreamTestCase {
+
+ public void test(Analyzer analyzer) throws Exception {
+ // ÷ 0001 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0001",
+ new String[] { });
+
+ // ÷ 0001 × 0308 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0001",
+ new String[] { });
+
+ // � 0001 � 000D � # � [0.2] <START OF HEADING> (Other) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\r",
+ new String[] { });
+
+ // � 0001 � 0308 � 000D � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\r",
+ new String[] { });
+
+ // � 0001 � 000A � # � [0.2] <START OF HEADING> (Other) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\n",
+ new String[] { });
+
+ // � 0001 � 0308 � 000A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\n",
+ new String[] { });
+
+ // � 0001 � 000B � # � [0.2] <START OF HEADING> (Other) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u000B",
+ new String[] { });
+
+ // � 0001 � 0308 � 000B � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u000B",
+ new String[] { });
+
+ // � 0001 � 3031 � # � [0.2] <START OF HEADING> (Other) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u3031",
+ new String[] { "\u3031" });
+
+ // � 0001 � 0308 � 3031 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u3031",
+ new String[] { "\u3031" });
+
+ // � 0001 � 0041 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0041",
+ new String[] { "\u0041" });
+
+ // � 0001 � 0308 � 0041 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0041",
+ new String[] { "\u0041" });
+
+ // � 0001 � 003A � # � [0.2] <START OF HEADING> (Other) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u003A",
+ new String[] { });
+
+ // � 0001 � 0308 � 003A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u003A",
+ new String[] { });
+
+ // � 0001 � 002C � # � [0.2] <START OF HEADING> (Other) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u002C",
+ new String[] { });
+
+ // � 0001 � 0308 � 002C � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u002C",
+ new String[] { });
+
+ // � 0001 � 002E � # � [0.2] <START OF HEADING> (Other) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u002E",
+ new String[] { });
+
+ // � 0001 � 0308 � 002E � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u002E",
+ new String[] { });
+
+ // � 0001 � 0030 � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0030",
+ new String[] { "\u0030" });
+
+ // � 0001 � 0308 � 0030 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0030",
+ new String[] { "\u0030" });
+
+ // � 0001 � 005F � # � [0.2] <START OF HEADING> (Other) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u005F",
+ new String[] { });
+
+ // � 0001 � 0308 � 005F � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u005F",
+ new String[] { });
+
+ // � 0001 � 1F1E6 � # � [0.2] <START OF HEADING> (Other) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\uD83C\uDDE6",
+ new String[] { });
+
+ // � 0001 � 0308 � 1F1E6 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\uD83C\uDDE6",
+ new String[] { });
+
+ // � 0001 � 05D0 � # � [0.2] <START OF HEADING> (Other) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 0001 � 0308 � 05D0 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 0001 � 0022 � # � [0.2] <START OF HEADING> (Other) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\"",
+ new String[] { });
+
+ // � 0001 � 0308 � 0022 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\"",
+ new String[] { });
+
+ // � 0001 � 0027 � # � [0.2] <START OF HEADING> (Other) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0027",
+ new String[] { });
+
+ // � 0001 � 0308 � 0027 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0027",
+ new String[] { });
+
+ // � 0001 � 00AD � # � [0.2] <START OF HEADING> (Other) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u00AD",
+ new String[] { });
+
+ // � 0001 � 0308 � 00AD � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u00AD",
+ new String[] { });
+
+ // � 0001 � 0300 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0300",
+ new String[] { });
+
+ // � 0001 � 0308 � 0300 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0300",
+ new String[] { });
+
+ // � 0001 � 0061 � 2060 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 0001 � 0308 � 0061 � 2060 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 0001 � 0061 � 003A � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0308 � 0061 � 003A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0061 � 0027 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0308 � 0061 � 0027 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0061 � 0027 � 2060 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0308 � 0061 � 0027 � 2060 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0061 � 002C � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0308 � 0061 � 002C � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 0001 � 0031 � 003A � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0308 � 0031 � 003A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0031 � 0027 � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0308 � 0031 � 0027 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0031 � 002C � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0308 � 0031 � 002C � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0031 � 002E � 2060 � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 0001 � 0308 � 0031 � 002E � 2060 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 000D � 0001 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0001",
+ new String[] { });
+
+ // � 000D � 0308 � 0001 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0001",
+ new String[] { });
+
+ // � 000D � 000D � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\r",
+ new String[] { });
+
+ // � 000D � 0308 � 000D � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\r",
+ new String[] { });
+
+ // ÷ 000D × 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) × [3.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\r\n",
+ new String[] { });
+
+ // � 000D � 0308 � 000A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\n",
+ new String[] { });
+
+ // � 000D � 000B � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u000B",
+ new String[] { });
+
+ // � 000D � 0308 � 000B � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u000B",
+ new String[] { });
+
+ // � 000D � 3031 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u3031",
+ new String[] { "\u3031" });
+
+ // � 000D � 0308 � 3031 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u3031",
+ new String[] { "\u3031" });
+
+ // � 000D � 0041 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0041",
+ new String[] { "\u0041" });
+
+ // � 000D � 0308 � 0041 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0041",
+ new String[] { "\u0041" });
+
+ // � 000D � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u003A",
+ new String[] { });
+
+ // � 000D � 0308 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u003A",
+ new String[] { });
+
+ // � 000D � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u002C",
+ new String[] { });
+
+ // � 000D � 0308 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u002C",
+ new String[] { });
+
+ // � 000D � 002E � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u002E",
+ new String[] { });
+
+ // � 000D � 0308 � 002E � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u002E",
+ new String[] { });
+
+ // � 000D � 0030 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0030",
+ new String[] { "\u0030" });
+
+ // � 000D � 0308 � 0030 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0030",
+ new String[] { "\u0030" });
+
+ // � 000D � 005F � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u005F",
+ new String[] { });
+
+ // � 000D � 0308 � 005F � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u005F",
+ new String[] { });
+
+ // � 000D � 1F1E6 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\uD83C\uDDE6",
+ new String[] { });
+
+ // � 000D � 0308 � 1F1E6 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\uD83C\uDDE6",
+ new String[] { });
+
+ // � 000D � 05D0 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 000D � 0308 � 05D0 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 000D � 0022 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\"",
+ new String[] { });
+
+ // � 000D � 0308 � 0022 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\"",
+ new String[] { });
+
+ // � 000D � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0027",
+ new String[] { });
+
+ // � 000D � 0308 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0027",
+ new String[] { });
+
+ // � 000D � 00AD � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u00AD",
+ new String[] { });
+
+ // � 000D � 0308 � 00AD � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u00AD",
+ new String[] { });
+
+ // � 000D � 0300 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0300",
+ new String[] { });
+
+ // � 000D � 0308 � 0300 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0300",
+ new String[] { });
+
+ // � 000D � 0061 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 000D � 0308 � 0061 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 000D � 0061 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 000D � 0308 � 0061 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 000D � 0061 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 000D � 0308 � 0061 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 000D � 0061 � 0027 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 000D � 0308 � 0061 � 0027 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 000D � 0061 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 000D � 0308 � 0061 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 000D � 0031 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 000D � 0308 � 0031 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 000D � 0031 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 000D � 0308 � 0031 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 000D � 0031 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 000D � 0308 � 0031 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 000D � 0031 � 002E � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 000D � 0308 � 0031 � 002E � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\r\u0308\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 000A � 0001 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0001",
+ new String[] { });
+
+ // � 000A � 0308 � 0001 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0001",
+ new String[] { });
+
+ // � 000A � 000D � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\r",
+ new String[] { });
+
+ // � 000A � 0308 � 000D � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\r",
+ new String[] { });
+
+ // � 000A � 000A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\n",
+ new String[] { });
+
+ // � 000A � 0308 � 000A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\n",
+ new String[] { });
+
+ // � 000A � 000B � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u000B",
+ new String[] { });
+
+ // � 000A � 0308 � 000B � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u000B",
+ new String[] { });
+
+ // � 000A � 3031 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u3031",
+ new String[] { "\u3031" });
+
+ // � 000A � 0308 � 3031 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u3031",
+ new String[] { "\u3031" });
+
+ // � 000A � 0041 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0041",
+ new String[] { "\u0041" });
+
+ // � 000A � 0308 � 0041 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0041",
+ new String[] { "\u0041" });
+
+ // � 000A � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u003A",
+ new String[] { });
+
+ // � 000A � 0308 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u003A",
+ new String[] { });
+
+ // � 000A � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u002C",
+ new String[] { });
+
+ // � 000A � 0308 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u002C",
+ new String[] { });
+
+ // � 000A � 002E � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u002E",
+ new String[] { });
+
+ // � 000A � 0308 � 002E � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u002E",
+ new String[] { });
+
+ // � 000A � 0030 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0030",
+ new String[] { "\u0030" });
+
+ // � 000A � 0308 � 0030 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0030",
+ new String[] { "\u0030" });
+
+ // � 000A � 005F � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u005F",
+ new String[] { });
+
+ // � 000A � 0308 � 005F � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u005F",
+ new String[] { });
+
+ // � 000A � 1F1E6 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\uD83C\uDDE6",
+ new String[] { });
+
+ // � 000A � 0308 � 1F1E6 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\uD83C\uDDE6",
+ new String[] { });
+
+ // � 000A � 05D0 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u05D0",
+ new String[] { "\u05D0" });
+
+ // ÷ 000A ÷ 0308 ÷ 05D0 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 000A � 0022 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\"",
+ new String[] { });
+
+ // � 000A � 0308 � 0022 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\"",
+ new String[] { });
+
+ // � 000A � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0027",
+ new String[] { });
+
+ // � 000A � 0308 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0027",
+ new String[] { });
+
+ // � 000A � 00AD � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u00AD",
+ new String[] { });
+
+ // � 000A � 0308 � 00AD � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u00AD",
+ new String[] { });
+
+ // � 000A � 0300 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0300",
+ new String[] { });
+
+ // � 000A � 0308 � 0300 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0300",
+ new String[] { });
+
+ // � 000A � 0061 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 000A � 0308 � 0061 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 000A � 0061 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 000A � 0308 � 0061 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 000A � 0061 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 000A � 0308 � 0061 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 000A � 0061 � 0027 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 000A � 0308 � 0061 � 0027 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 000A � 0061 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 000A � 0308 � 0061 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 000A � 0031 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 000A � 0308 � 0031 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 000A � 0031 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 000A � 0308 � 0031 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 000A � 0031 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 000A � 0308 � 0031 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 000A � 0031 � 002E � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 000A � 0308 � 0031 � 002E � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\n\u0308\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 000B � 0001 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0001",
+ new String[] { });
+
+ // � 000B � 0308 � 0001 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0001",
+ new String[] { });
+
+ // � 000B � 000D � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\r",
+ new String[] { });
+
+ // � 000B � 0308 � 000D � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\r",
+ new String[] { });
+
+ // � 000B � 000A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\n",
+ new String[] { });
+
+ // � 000B � 0308 � 000A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\n",
+ new String[] { });
+
+ // � 000B � 000B � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u000B",
+ new String[] { });
+
+ // � 000B � 0308 � 000B � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u000B",
+ new String[] { });
+
+ // � 000B � 3031 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u3031",
+ new String[] { "\u3031" });
+
+ // � 000B � 0308 � 3031 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u3031",
+ new String[] { "\u3031" });
+
+ // � 000B � 0041 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0041",
+ new String[] { "\u0041" });
+
+ // � 000B � 0308 � 0041 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0041",
+ new String[] { "\u0041" });
+
+ // � 000B � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u003A",
+ new String[] { });
+
+ // � 000B � 0308 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u003A",
+ new String[] { });
+
+ // � 000B � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u002C",
+ new String[] { });
+
+ // � 000B � 0308 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u002C",
+ new String[] { });
+
+ // � 000B � 002E � # � [0.2] <LINE TABULATION> (Newline) � [3.1] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u002E",
+ new String[] { });
+
+ // � 000B � 0308 � 002E � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u002E",
+ new String[] { });
+
+ // � 000B � 0030 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0030",
+ new String[] { "\u0030" });
+
+ // � 000B � 0308 � 0030 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0030",
+ new String[] { "\u0030" });
+
+ // � 000B � 005F � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u005F",
+ new String[] { });
+
+ // � 000B � 0308 � 005F � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u005F",
+ new String[] { });
+
+ // � 000B � 1F1E6 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\uD83C\uDDE6",
+ new String[] { });
+
+ // � 000B � 0308 � 1F1E6 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\uD83C\uDDE6",
+ new String[] { });
+
+ // � 000B � 05D0 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 000B � 0308 � 05D0 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 000B � 0022 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\"",
+ new String[] { });
+
+ // � 000B � 0308 � 0022 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\"",
+ new String[] { });
+
+ // � 000B � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0027",
+ new String[] { });
+
+ // � 000B � 0308 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0027",
+ new String[] { });
+
+ // � 000B � 00AD � # � [0.2] <LINE TABULATION> (Newline) � [3.1] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u00AD",
+ new String[] { });
+
+ // � 000B � 0308 � 00AD � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u00AD",
+ new String[] { });
+
+ // � 000B � 0300 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0300",
+ new String[] { });
+
+ // � 000B � 0308 � 0300 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0300",
+ new String[] { });
+
+ // � 000B � 0061 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 000B � 0308 � 0061 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u2060",
+ new String[] { "\u0061\u2060" });
+
+ // � 000B � 0061 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 000B � 0308 � 0061 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u003A",
+ new String[] { "\u0061" });
+
+ // � 000B � 0061 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 000B � 0308 � 0061 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u0027",
+ new String[] { "\u0061" });
+
+ // � 000B � 0061 � 0027 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 000B � 0308 � 0061 � 0027 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u0027\u2060",
+ new String[] { "\u0061" });
+
+ // � 000B � 0061 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 000B � 0308 � 0061 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u002C",
+ new String[] { "\u0061" });
+
+ // � 000B � 0031 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 000B � 0308 � 0031 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u003A",
+ new String[] { "\u0031" });
+
+ // � 000B � 0031 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 000B � 0308 � 0031 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u0027",
+ new String[] { "\u0031" });
+
+ // � 000B � 0031 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 000B � 0308 � 0031 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u002C",
+ new String[] { "\u0031" });
+
+ // � 000B � 0031 � 002E � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // � 000B � 0308 � 0031 � 002E � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u002E\u2060",
+ new String[] { "\u0031" });
+
+ // ÷ 3031 ÷ 0001 ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0001",
+ new String[] { "\u3031" });
+
+ // ÷ 3031 × 0308 ÷ 0001 ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0001",
+ new String[] { "\u3031\u0308" });
+
+ // ÷ 3031 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\r",
+ new String[] { "\u3031" });
+
+ // ÷ 3031 × 0308 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\r",
+ new String[] { "\u3031\u0308" });
+
+ // ÷ 3031 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\n",
+ new String[] { "\u3031" });
+
+ // ÷ 3031 × 0308 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\n",
+ new String[] { "\u3031\u0308" });
+
+ // ÷ 3031 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u000B",
+ new String[] { "\u3031" });
+
+ // ÷ 3031 × 0308 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u000B",
+ new String[] { "\u3031\u0308" });
+
+ // ÷ 3031 × 3031 ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [13.0] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u3031",
+ new String[] { "\u3031\u3031" });
+
+ // ÷ 3031 × 0308 × 3031 ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) × [13.0] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u3031",
+ new String[] { "\u3031\u0308\u3031" });
+
+ // � 3031 � 0041 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0041",
+ new String[] { "\u3031", "\u0041" });
+
+ // � 3031 � 0308 � 0041 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0041",
+ new String[] { "\u3031\u0308", "\u0041" });
+
+ // � 3031 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u003A",
+ new String[] { "\u3031" });
+
+ // � 3031 � 0308 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u003A",
+ new String[] { "\u3031\u0308" });
+
+ // � 3031 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u002C",
+ new String[] { "\u3031" });
+
+ // � 3031 � 0308 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u002C",
+ new String[] { "\u3031\u0308" });
+
+ // � 3031 � 002E � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u002E",
+ new String[] { "\u3031" });
+
+ // � 3031 � 0308 � 002E � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u002E",
+ new String[] { "\u3031\u0308" });
+
+ // � 3031 � 0030 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0030",
+ new String[] { "\u3031", "\u0030" });
+
+ // � 3031 � 0308 � 0030 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0030",
+ new String[] { "\u3031\u0308", "\u0030" });
+
+ // ÷ 3031 × 005F ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [13.1] LOW LINE (ExtendNumLet) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u005F",
+ new String[] { "\u3031\u005F" });
+
+ // � 3031 � 0308 � 005F � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [13.1] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u005F",
+ new String[] { "\u3031\u0308\u005F" });
+
+ // � 3031 � 1F1E6 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\uD83C\uDDE6",
+ new String[] { "\u3031" });
+
+ // � 3031 � 0308 � 1F1E6 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\uD83C\uDDE6",
+ new String[] { "\u3031\u0308" });
+
+ // � 3031 � 05D0 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u05D0",
+ new String[] { "\u3031", "\u05D0" });
+
+ // � 3031 � 0308 � 05D0 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u05D0",
+ new String[] { "\u3031\u0308", "\u05D0" });
+
+ // � 3031 � 0022 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\"",
+ new String[] { "\u3031" });
+
+ // � 3031 � 0308 � 0022 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\"",
+ new String[] { "\u3031\u0308" });
+
+ // � 3031 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0027",
+ new String[] { "\u3031" });
+
+ // � 3031 � 0308 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0027",
+ new String[] { "\u3031\u0308" });
+
+ // ÷ 3031 × 00AD ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] SOFT HYPHEN (Format_FE) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u00AD",
+ new String[] { "\u3031\u00AD" });
+
+ // � 3031 � 0308 � 00AD � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u00AD",
+ new String[] { "\u3031\u0308\u00AD" });
+
+ // � 3031 � 0300 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0300",
+ new String[] { "\u3031\u0300" });
+
+ // � 3031 � 0308 � 0300 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0300",
+ new String[] { "\u3031\u0308\u0300" });
+
+ // � 3031 � 0061 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0061\u2060",
+ new String[] { "\u3031", "\u0061\u2060" });
+
+ // � 3031 � 0308 � 0061 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u2060",
+ new String[] { "\u3031\u0308", "\u0061\u2060" });
+
+ // � 3031 � 0061 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0061\u003A",
+ new String[] { "\u3031", "\u0061" });
+
+ // � 3031 � 0308 � 0061 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u003A",
+ new String[] { "\u3031\u0308", "\u0061" });
+
+ // � 3031 � 0061 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0061\u0027",
+ new String[] { "\u3031", "\u0061" });
+
+ // � 3031 � 0308 � 0061 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u0027",
+ new String[] { "\u3031\u0308", "\u0061" });
+
+ // � 3031 � 0061 � 0027 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0061\u0027\u2060",
+ new String[] { "\u3031", "\u0061" });
+
+ // � 3031 � 0308 � 0061 � 0027 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u0027\u2060",
+ new String[] { "\u3031\u0308", "\u0061" });
+
+ // � 3031 � 0061 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0061\u002C",
+ new String[] { "\u3031", "\u0061" });
+
+ // � 3031 � 0308 � 0061 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u002C",
+ new String[] { "\u3031\u0308", "\u0061" });
+
+ // � 3031 � 0031 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0031\u003A",
+ new String[] { "\u3031", "\u0031" });
+
+ // � 3031 � 0308 � 0031 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u003A",
+ new String[] { "\u3031\u0308", "\u0031" });
+
+ // � 3031 � 0031 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0031\u0027",
+ new String[] { "\u3031", "\u0031" });
+
+ // � 3031 � 0308 � 0031 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u0027",
+ new String[] { "\u3031\u0308", "\u0031" });
+
+ // � 3031 � 0031 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0031\u002C",
+ new String[] { "\u3031", "\u0031" });
+
+ // � 3031 � 0308 � 0031 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u002C",
+ new String[] { "\u3031\u0308", "\u0031" });
+
+ // � 3031 � 0031 � 002E � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0031\u002E\u2060",
+ new String[] { "\u3031", "\u0031" });
+
+ // � 3031 � 0308 � 0031 � 002E � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u002E\u2060",
+ new String[] { "\u3031\u0308", "\u0031" });
+
+ // ÷ 0041 ÷ 0001 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0001",
+ new String[] { "\u0041" });
+
+ // ÷ 0041 × 0308 ÷ 0001 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0001",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 000D � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\r",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 000D � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\r",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 000A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\n",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 000A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\n",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 000B � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u000B",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 000B � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u000B",
+ new String[] { "\u0041\u0308" });
+
+ // ÷ 0041 ÷ 3031 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [999.0] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u3031",
+ new String[] { "\u0041", "\u3031" });
+
+ // � 0041 � 0308 � 3031 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u3031",
+ new String[] { "\u0041\u0308", "\u3031" });
+
+ // ÷ 0041 × 0041 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [5.0] LATIN CAPITAL LETTER A (ALetter) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0041",
+ new String[] { "\u0041\u0041" });
+
+ // � 0041 � 0308 � 0041 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0041",
+ new String[] { "\u0041\u0308\u0041" });
+
+ // � 0041 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u003A",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u003A",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u002C",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u002C",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 002E � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u002E",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 002E � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u002E",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 0030 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0030",
+ new String[] { "\u0041\u0030" });
+
+ // � 0041 � 0308 � 0030 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0030",
+ new String[] { "\u0041\u0308\u0030" });
+
+ // � 0041 � 005F � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [13.1] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u005F",
+ new String[] { "\u0041\u005F" });
+
+ // � 0041 � 0308 � 005F � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [13.1] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u005F",
+ new String[] { "\u0041\u0308\u005F" });
+
+ // ÷ 0041 ÷ 1F1E6 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\uD83C\uDDE6",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 1F1E6 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\uD83C\uDDE6",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 05D0 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u05D0",
+ new String[] { "\u0041\u05D0" });
+
+ // � 0041 � 0308 � 05D0 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u05D0",
+ new String[] { "\u0041\u0308\u05D0" });
+
+ // � 0041 � 0022 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\"",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 0022 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\"",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0027",
+ new String[] { "\u0041" });
+
+ // � 0041 � 0308 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0027",
+ new String[] { "\u0041\u0308" });
+
+ // � 0041 � 00AD � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u00AD",
+ new String[] { "\u0041\u00AD" });
+
+ // � 0041 � 0308 � 00AD � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u00AD",
+ new String[] { "\u0041\u0308\u00AD" });
+
+ // � 0041 � 0300 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0300",
+ new String[] { "\u0041\u0300" });
+
+ // � 0041 � 0308 � 0300 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0300",
+ new String[] { "\u0041\u0308\u0300" });
+
+ // � 0041 � 0061 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0061\u2060",
+ new String[] { "\u0041\u0061\u2060" });
+
+ // � 0041 � 0308 � 0061 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u2060",
+ new String[] { "\u0041\u0308\u0061\u2060" });
+
+ // � 0041 � 0061 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0061\u003A",
+ new String[] { "\u0041\u0061" });
+
+ // � 0041 � 0308 � 0061 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u003A",
+ new String[] { "\u0041\u0308\u0061" });
+
+ // � 0041 � 0061 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0061\u0027",
+ new String[] { "\u0041\u0061" });
+
+ // � 0041 � 0308 � 0061 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u0027",
+ new String[] { "\u0041\u0308\u0061" });
+
+ // � 0041 � 0061 � 0027 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0061\u0027\u2060",
+ new String[] { "\u0041\u0061" });
+
+ // � 0041 � 0308 � 0061 � 0027 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u0027\u2060",
+ new String[] { "\u0041\u0308\u0061" });
+
+ // � 0041 � 0061 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0061\u002C",
+ new String[] { "\u0041\u0061" });
+
+ // � 0041 � 0308 � 0061 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u002C",
+ new String[] { "\u0041\u0308\u0061" });
+
+ // � 0041 � 0031 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0031\u003A",
+ new String[] { "\u0041\u0031" });
+
+ // � 0041 � 0308 � 0031 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u003A",
+ new String[] { "\u0041\u0308\u0031" });
+
+ // � 0041 � 0031 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0031\u0027",
+ new String[] { "\u0041\u0031" });
+
+ // � 0041 � 0308 � 0031 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u0027",
+ new String[] { "\u0041\u0308\u0031" });
+
+ // � 0041 � 0031 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0031\u002C",
+ new String[] { "\u0041\u0031" });
+
+ // � 0041 � 0308 � 0031 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u002C",
+ new String[] { "\u0041\u0308\u0031" });
+
+ // � 0041 � 0031 � 002E � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0031\u002E\u2060",
+ new String[] { "\u0041\u0031" });
+
+ // � 0041 � 0308 � 0031 � 002E � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u002E\u2060",
+ new String[] { "\u0041\u0308\u0031" });
+
+ // ÷ 003A ÷ 0001 ÷ # ÷ [0.2] COLON (MidLetter) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0001",
+ new String[] { });
+
+ // � 003A � 0308 � 0001 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u0001",
+ new String[] { });
+
+ // � 003A � 000D � # � [0.2] COLON (MidLetter) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\r",
+ new String[] { });
+
+ // � 003A � 0308 � 000D � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\r",
+ new String[] { });
+
+ // � 003A � 000A � # � [0.2] COLON (MidLetter) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\n",
+ new String[] { });
+
+ // � 003A � 0308 � 000A � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\n",
+ new String[] { });
+
+ // � 003A � 000B � # � [0.2] COLON (MidLetter) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u000B",
+ new String[] { });
+
+ // � 003A � 0308 � 000B � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u000B",
+ new String[] { });
+
+ // � 003A � 3031 � # � [0.2] COLON (MidLetter) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u3031",
+ new String[] { "\u3031" });
+
+ // � 003A � 0308 � 3031 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u3031",
+ new String[] { "\u3031" });
+
+ // � 003A � 0041 � # � [0.2] COLON (MidLetter) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0041",
+ new String[] { "\u0041" });
+
+ // � 003A � 0308 � 0041 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u0041",
+ new String[] { "\u0041" });
+
+ // � 003A � 003A � # � [0.2] COLON (MidLetter) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u003A",
+ new String[] { });
+
+ // � 003A � 0308 � 003A � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u003A",
+ new String[] { });
+
+ // � 003A � 002C � # � [0.2] COLON (MidLetter) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u002C",
+ new String[] { });
+
+ // � 003A � 0308 � 002C � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u002C",
+ new String[] { });
+
+ // � 003A � 002E � # � [0.2] COLON (MidLetter) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u002E",
+ new String[] { });
+
+ // � 003A � 0308 � 002E � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u002E",
+ new String[] { });
+
+ // � 003A � 0030 � # � [0.2] COLON (MidLetter) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0030",
+ new String[] { "\u0030" });
+
+ // � 003A � 0308 � 0030 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u0030",
+ new String[] { "\u0030" });
+
+ // � 003A � 005F � # � [0.2] COLON (MidLetter) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u005F",
+ new String[] { });
+
+ // � 003A � 0308 � 005F � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u005F",
+ new String[] { });
+
+ // � 003A � 1F1E6 � # � [0.2] COLON (MidLetter) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\uD83C\uDDE6",
+ new String[] { });
+
+ // � 003A � 0308 � 1F1E6 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\uD83C\uDDE6",
+ new String[] { });
+
+ // � 003A � 05D0 � # � [0.2] COLON (MidLetter) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 003A � 0308 � 05D0 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u05D0",
+ new String[] { "\u05D0" });
+
+ // � 003A � 0022 � # � [0.2] COLON (MidLetter) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\"",
+ new String[] { });
+
+ // � 003A � 0308 � 0022 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\"",
+ new String[] { });
+
+ // � 003A � 0027 � # � [0.2] COLON (MidLetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0027",
+ new String[] { });
+
+ // � 003A � 0308 � 0027 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u0027",
+ new String[] { });
+
+ // ÷ 003A × 00AD ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] SOFT HYPHEN (Format_FE) ÷ [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u00AD",
+ new String[] { });
+
+ // � 003A � 0308 � 00AD � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u00AD",
+ new String[] { });
+
+ // � 003A � 0300 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0300",
+ new String[] { });
+
+ // � 003A � 0308 � 0300 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
+ assertAnalyzesTo(analyzer, "\u003A\u0308\u0300",
+ new String[
<TRUNCATED>
[09/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
index 9772203..8f7f2cd 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
@@ -22,6 +22,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
index 783811a..1d17237 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
@@ -25,6 +25,7 @@ import java.util.HashSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockTokenFilter;
@@ -39,7 +40,6 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index bf02ccd..4effc79 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -50,6 +50,8 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.CrankyTokenFilter;
import org.apache.lucene.analysis.MockGraphTokenFilter;
@@ -73,8 +75,8 @@ import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
@@ -83,8 +85,6 @@ import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.snowball.TestSnowball;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.AttributeFactory;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java
index f7552c8..bbf9502 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java
@@ -17,16 +17,16 @@
package org.apache.lucene.analysis.core;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
-
import java.io.IOException;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
-import java.util.HashSet;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public class TestStopAnalyzer extends BaseTokenStreamTestCase {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
deleted file mode 100644
index 25b89d9..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.core;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.English;
-
-public class TestStopFilter extends BaseTokenStreamTestCase {
-
- // other StopFilter functionality is already tested by TestStopAnalyzer
-
- public void testExactCase() throws IOException {
- StringReader reader = new StringReader("Now is The Time");
- CharArraySet stopWords = new CharArraySet(asSet("is", "the", "Time"), false);
- final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- in.setReader(reader);
- TokenStream stream = new StopFilter(in, stopWords);
- assertTokenStreamContents(stream, new String[] { "Now", "The" });
- }
-
- public void testStopFilt() throws IOException {
- StringReader reader = new StringReader("Now is The Time");
- String[] stopWords = new String[] { "is", "the", "Time" };
- CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
- final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- in.setReader(reader);
- TokenStream stream = new StopFilter(in, stopSet);
- assertTokenStreamContents(stream, new String[] { "Now", "The" });
- }
-
- /**
- * Test Position increments applied by StopFilter with and without enabling this option.
- */
- public void testStopPositons() throws IOException {
- StringBuilder sb = new StringBuilder();
- ArrayList<String> a = new ArrayList<>();
- for (int i=0; i<20; i++) {
- String w = English.intToEnglish(i).trim();
- sb.append(w).append(" ");
- if (i%3 != 0) a.add(w);
- }
- log(sb.toString());
- String stopWords[] = a.toArray(new String[0]);
- for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
- CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
- // with increments
- StringReader reader = new StringReader(sb.toString());
- final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- in.setReader(reader);
- StopFilter stpf = new StopFilter(in, stopSet);
- doTestStopPositons(stpf);
- // with increments, concatenating two stop filters
- ArrayList<String> a0 = new ArrayList<>();
- ArrayList<String> a1 = new ArrayList<>();
- for (int i=0; i<a.size(); i++) {
- if (i%2==0) {
- a0.add(a.get(i));
- } else {
- a1.add(a.get(i));
- }
- }
- String stopWords0[] = a0.toArray(new String[0]);
- for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
- String stopWords1[] = a1.toArray(new String[0]);
- for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
- CharArraySet stopSet0 = StopFilter.makeStopSet(stopWords0);
- CharArraySet stopSet1 = StopFilter.makeStopSet(stopWords1);
- reader = new StringReader(sb.toString());
- final MockTokenizer in1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- in1.setReader(reader);
- StopFilter stpf0 = new StopFilter(in1, stopSet0); // first part of the set
- StopFilter stpf01 = new StopFilter(stpf0, stopSet1); // two stop filters concatenated!
- doTestStopPositons(stpf01);
- }
-
- // LUCENE-3849: make sure after .end() we see the "ending" posInc
- public void testEndStopword() throws Exception {
- CharArraySet stopSet = StopFilter.makeStopSet("of");
- final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- in.setReader(new StringReader("test of"));
- StopFilter stpf = new StopFilter(in, stopSet);
- assertTokenStreamContents(stpf, new String[] { "test" },
- new int[] {0},
- new int[] {4},
- null,
- new int[] {1},
- null,
- 7,
- 1,
- null,
- true);
- }
-
- private void doTestStopPositons(StopFilter stpf) throws IOException {
- CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
- PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
- stpf.reset();
- for (int i=0; i<20; i+=3) {
- assertTrue(stpf.incrementToken());
- log("Token "+i+": "+stpf);
- String w = English.intToEnglish(i).trim();
- assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
- assertEquals("all but first token must have position increment of 3",i==0?1:3,posIncrAtt.getPositionIncrement());
- }
- assertFalse(stpf.incrementToken());
- stpf.end();
- stpf.close();
- }
-
- // print debug info depending on VERBOSE
- private static void log(String s) {
- if (VERBOSE) {
- System.out.println(s);
- }
- }
-
- // stupid filter that inserts synonym of 'hte' for 'the'
- private class MockSynonymFilter extends TokenFilter {
- State bufferedState;
- CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-
- MockSynonymFilter(TokenStream input) {
- super(input);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (bufferedState != null) {
- restoreState(bufferedState);
- posIncAtt.setPositionIncrement(0);
- termAtt.setEmpty().append("hte");
- bufferedState = null;
- return true;
- } else if (input.incrementToken()) {
- if (termAtt.toString().equals("the")) {
- bufferedState = captureState();
- }
- return true;
- } else {
- return false;
- }
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- bufferedState = null;
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java
index 9fca6b9..f2d6fe3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java
@@ -17,8 +17,8 @@
package org.apache.lucene.analysis.core;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
index 75ec358..966b1fd 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
@@ -19,9 +19,9 @@ package org.apache.lucene.analysis.cz;
import java.io.IOException;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Test the CzechAnalyzer
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
index 7463f1d..3d45d57 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
@@ -22,11 +22,11 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Test the Czech Stemmer.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
index 918962b..199981e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
index 0e1f093..4c52c0e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
@@ -22,9 +22,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
index 75c4499..cb67e93 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
index 80228f7..35a8004 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index c0c522f..c9d3140 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -22,13 +22,13 @@ import java.io.InputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
index 1fcbbbc..60aa6a8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
index 9563d00..be3e9c4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
@@ -20,14 +20,14 @@ package org.apache.lucene.analysis.en;
import java.io.IOException;
import java.io.StringReader;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
index 4bffffa..39d40f4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java
index d1f64b2..b2a3d68 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
index 67982a2..8cad085 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.fa;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
index e9880c0..83d6dba3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
index 09c2b4e..1313aaf 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
index 36fb0dc..9834621 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Test case for FrenchAnalyzer.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
index 8a526f5..a8e18b0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
index d55fe51..99f9566 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java
index 54d7254..50a6294 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java
index a215121..78e1719 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
index d1ffe89..e57f6cd 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Simple tests for {@link GalicianMinimalStemmer}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
index 63321d5..f95c455 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.hi;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java
index 1ce8d38..cf591db 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
index 3b8951c..67399d0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
index 5f39926..677351e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
@@ -24,11 +24,11 @@ import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.junit.AfterClass;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java
index b9934c8..704187b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
index 424f117..366bad7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
index bba4947..c7c51b8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/lt/TestLithuanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/lt/TestLithuanianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/lt/TestLithuanianAnalyzer.java
index dda018c..86c3f16 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/lt/TestLithuanianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/lt/TestLithuanianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestLithuanianAnalyzer extends BaseTokenStreamTestCase {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
index 4c6e432..5f400b5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
index ef1c30e..5590f04 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
@@ -25,10 +25,10 @@ import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.Test;
import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
index 6110e2b..dde6f94 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
@@ -17,8 +17,8 @@
package org.apache.lucene.analysis.miscellaneous;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
@@ -49,4 +49,4 @@ public class TestKeepFilterFactory extends BaseTokenStreamFactoryTestCase {
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
index 847b26c..19e77b0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
@@ -21,10 +21,10 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
/** Test {@link KeepWordFilter} */
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java
index c5b2481..67a421b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java
@@ -21,11 +21,11 @@ import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.junit.Test;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
index ef4856c..1e4fce0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
@@ -26,13 +26,13 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
-import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.TestUtil;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index c7dfa7d..a22d9c9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -16,19 +16,19 @@
*/
package org.apache.lucene.analysis.miscellaneous;
+import java.io.IOException;
+import java.util.*;
+
import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.IOUtils;
import org.junit.Test;
-import java.io.IOException;
-import java.util.*;
-
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchAnalyzer.java
index b7f3ebc..8055660 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchAnalyzer.java
@@ -19,10 +19,10 @@ package org.apache.lucene.analysis.nl;
import java.io.IOException;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Test the Dutch Stem Filter, which only modifies the term text.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
index 1dd9217..9cb494d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
index 38fe12b..89e52af 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
@@ -23,12 +23,12 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
index d0593dc..69b5b0c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
@@ -23,12 +23,12 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
index 7e4dba7..d948c30 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
index 00a6d0f..95d3ff7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
index e9dd584..b44460f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
index 5209923..7bdaac8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
@@ -17,18 +17,18 @@
package org.apache.lucene.analysis.pt;
-import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
-
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
+
+import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
/**
* Simple tests for {@link PortugueseStemFilter}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
index 1d4e2f5..15c0286 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
index 60e9fb4..174feb1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
@@ -19,9 +19,9 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Test case for RussianAnalyzer.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
index 19b9309..604b230 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
index c97ec03..bcdefed 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
@@ -19,15 +19,15 @@ package org.apache.lucene.analysis.shingle;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
index 735f12e..c0127a3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
@@ -23,16 +23,16 @@ import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
[03/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
new file mode 100644
index 0000000..943e427
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+import org.apache.lucene.analysis.WordlistLoader;
+
+public class TestWordlistLoader extends LuceneTestCase {
+
+ public void testWordlistLoading() throws IOException {
+ String s = "ONE\n two \nthree";
+ CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
+ checkSet(wordSet1);
+ CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
+ checkSet(wordSet2);
+ }
+
+ public void testComments() throws Exception {
+ String s = "ONE\n two \nthree\n#comment";
+ CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
+ checkSet(wordSet1);
+ assertFalse(wordSet1.contains("#comment"));
+ assertFalse(wordSet1.contains("comment"));
+ }
+
+
+ private void checkSet(CharArraySet wordset) {
+ assertEquals(3, wordset.size());
+ assertTrue(wordset.contains("ONE")); // case is not modified
+ assertTrue(wordset.contains("two")); // surrounding whitespace is removed
+ assertTrue(wordset.contains("three"));
+ assertFalse(wordset.contains("four"));
+ }
+
+ /**
+ * Test stopwords in snowball format
+ */
+ public void testSnowballListLoading() throws IOException {
+ String s =
+ "|comment\n" + // commented line
+ " |comment\n" + // commented line with leading whitespace
+ "\n" + // blank line
+ " \t\n" + // line with only whitespace
+ " |comment | comment\n" + // commented line with comment
+ "ONE\n" + // stopword, in uppercase
+ " two \n" + // stopword with leading/trailing space
+ " three four five \n" + // multiple stopwords
+ "six seven | comment\n"; //multiple stopwords + comment
+ CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s));
+ assertEquals(7, wordset.size());
+ assertTrue(wordset.contains("ONE"));
+ assertTrue(wordset.contains("two"));
+ assertTrue(wordset.contains("three"));
+ assertTrue(wordset.contains("four"));
+ assertTrue(wordset.contains("five"));
+ assertTrue(wordset.contains("six"));
+ assertTrue(wordset.contains("seven"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
new file mode 100644
index 0000000..6c6ddc8
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -0,0 +1,390 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Random;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockGraphTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.TestUtil;
+
+public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
+
+ // LUCENE-5897: slow tokenization of strings of the form (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
+ @Slow
+ public void testLargePartiallyMatchingToken() throws Exception {
+ // TODO: get these lists of chars matching a property from ICU4J
+ // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+ char[] WordBreak_ExtendNumLet_chars = "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
+
+ // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+ int[] WordBreak_Format_chars // only the first char in ranges
+ = { 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF,
+ 0xFFF9, 0x110BD, 0x1D173, 0xE0001, 0xE0020 };
+
+ // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+ int[] WordBreak_Extend_chars // only the first char in ranges
+ = { 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df,
+ 0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4,
+ 0x900, 0x93a, 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2,
+ 0xa01, 0xa3c, 0xa3e, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7,
+ 0xacb, 0xae2, 0xb01, 0xb3c, 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6,
+ 0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6,
+ 0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46, 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf,
+ 0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1, 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35,
+ 0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102b, 0x1056, 0x105e, 0x1062,
+ 0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4,
+ 0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8, 0x1a17, 0x1a55, 0x1a60, 0x1a7f,
+ 0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24, 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2,
+ 0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674,
+ 0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880, 0xa8b4, 0xa8e0, 0xa926, 0xa947,
+ 0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1,
+ 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20, 0xff9e, 0x101fd, 0x10a01,
+ 0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038, 0x11080, 0x11082,
+ 0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180, 0x11182,
+ 0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
+ 0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100 };
+
+ StringBuilder builder = new StringBuilder();
+ int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
+ for (int i = 0 ; i < numChars ; ) {
+ builder.append(WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
+ ++i;
+ if (random().nextBoolean()) {
+ int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
+ for (int j = 0; j < numFormatExtendChars; ++j) {
+ int codepoint;
+ if (random().nextBoolean()) {
+ codepoint = WordBreak_Format_chars[random().nextInt(WordBreak_Format_chars.length)];
+ } else {
+ codepoint = WordBreak_Extend_chars[random().nextInt(WordBreak_Extend_chars.length)];
+ }
+ char[] chars = Character.toChars(codepoint);
+ builder.append(chars);
+ i += chars.length;
+ }
+ }
+ }
+ StandardTokenizer ts = new StandardTokenizer();
+ ts.setReader(new StringReader(builder.toString()));
+ ts.reset();
+ while (ts.incrementToken()) { }
+ ts.end();
+ ts.close();
+
+ int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+ ts.setMaxTokenLength(newBufferSize); // try a different buffer size
+ ts.setReader(new StringReader(builder.toString()));
+ ts.reset();
+ while (ts.incrementToken()) { }
+ ts.end();
+ ts.close();
+ }
+
+ public void testHugeDoc() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ char whitespace[] = new char[4094];
+ Arrays.fill(whitespace, ' ');
+ sb.append(whitespace);
+ sb.append("testing 1234");
+ String input = sb.toString();
+ StandardTokenizer tokenizer = new StandardTokenizer();
+ tokenizer.setReader(new StringReader(input));
+ BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
+ }
+
+ private Analyzer a;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
+ return new TokenStreamComponents(tokenizer);
+ }
+ };
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ a.close();
+ super.tearDown();
+ }
+
+ public void testArmenian() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b 13 \u0574\u056b\u056c\u056b\u0578\u0576 \u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568 (4,600` \u0570\u0561\u0575\u0565\u0580\u0565\u0576 \u057e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u0578\u0582\u0574) \u0563\u0580\u057e\u0565\u056c \u0565\u0576 \u056f\u0561\u0574\u0561\u057e\u0578\u0580\u0576\u0565\u0580\u056b \u056f\u0578\u0572\u0574\u056b\u0581 \u0578\u0582 \u0570\u0561\u0574\u0561\u0580\u0575\u0561 \u0562\u0578\u056c\u0578\u0580 \u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568 \u056f\u0561\u0580\u0578\u0572 \u0567 \u056d\u0574\u0562\u0561\u0563\u0580\u0565\u056c \u0581\u0561\u0576\u056f\u0561\u0581 \u0574\u0561\u0580\u0564 \u0578\u057e \u056f\u0561\u0580\u0578\u0572 \u0567 \u0562\u0561\u0581\u0565\u056c \u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b \u056f\u0561\u0575\u0584\u0568\u0589",
+ new String[] { "\u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b", "13", "\u0574\u056b\u056c\u056b\u0578\u0576", "\u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568", "4,600", "\u0570\u0561\u0575\u0565\u0580\u0565\u0576", "\u057e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u0578\u0582\u0574", "\u0563\u0580\u057e\u0565\u056c", "\u0565\u0576", "\u056f\u0561\u0574\u0561\u057e\u0578\u0580\u0576\u0565\u0580\u056b", "\u056f\u0578\u0572\u0574\u056b\u0581",
+ "\u0578\u0582", "\u0570\u0561\u0574\u0561\u0580\u0575\u0561", "\u0562\u0578\u056c\u0578\u0580", "\u0570\u0578\u0564\u057e\u0561\u056e\u0576\u0565\u0580\u0568", "\u056f\u0561\u0580\u0578\u0572", "\u0567", "\u056d\u0574\u0562\u0561\u0563\u0580\u0565\u056c", "\u0581\u0561\u0576\u056f\u0561\u0581", "\u0574\u0561\u0580\u0564", "\u0578\u057e", "\u056f\u0561\u0580\u0578\u0572", "\u0567", "\u0562\u0561\u0581\u0565\u056c", "\u054e\u056b\u0584\u056b\u057a\u0565\u0564\u056b\u0561\u0575\u056b", "\u056f\u0561\u0575\u0584\u0568" } );
+ }
+
+ public void testAmharic() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u12ca\u12aa\u1354\u12f5\u12eb \u12e8\u1263\u1208 \u1265\u12d9 \u124b\u1295\u124b \u12e8\u1270\u121f\u120b \u1275\u12ad\u12ad\u1208\u129b\u1293 \u1290\u133b \u1218\u12dd\u1308\u1260 \u12d5\u12cd\u1240\u1275 (\u12a2\u1295\u1233\u12ed\u12ad\u120e\u1352\u12f2\u12eb) \u1290\u12cd\u1362 \u121b\u1295\u129b\u12cd\u121d",
+ new String[] { "\u12ca\u12aa\u1354\u12f5\u12eb", "\u12e8\u1263\u1208", "\u1265\u12d9", "\u124b\u1295\u124b", "\u12e8\u1270\u121f\u120b", "\u1275\u12ad\u12ad\u1208\u129b\u1293", "\u1290\u133b", "\u1218\u12dd\u1308\u1260", "\u12d5\u12cd\u1240\u1275", "\u12a2\u1295\u1233\u12ed\u12ad\u120e\u1352\u12f2\u12eb", "\u1290\u12cd", "\u121b\u1295\u129b\u12cd\u121d" } );
+ }
+
+ public void testArabic() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0627\u0644\u0641\u064a\u0644\u0645 \u0627\u0644\u0648\u062b\u0627\u0626\u0642\u064a \u0627\u0644\u0623\u0648\u0644 \u0639\u0646 \u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627 \u064a\u0633\u0645\u0649 \"\u0627\u0644\u062d\u0642\u064a\u0642\u0629 \u0628\u0627\u0644\u0623\u0631\u0642\u0627\u0645: \u0642\u0635\u0629 \u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627\" (\u0628\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629: Truth in Numbers: The Wikipedia Story)\u060c \u0633\u064a\u062a\u0645 \u0625\u0637\u0644\u0627\u0642\u0647 \u0641\u064a 2008.",
+ new String[] { "\u0627\u0644\u0641\u064a\u0644\u0645", "\u0627\u0644\u0648\u062b\u0627\u0626\u0642\u064a", "\u0627\u0644\u0623\u0648\u0644", "\u0639\u0646", "\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627", "\u064a\u0633\u0645\u0649", "\u0627\u0644\u062d\u0642\u064a\u0642\u0629", "\u0628\u0627\u0644\u0623\u0631\u0642\u0627\u0645", "\u0642\u0635\u0629", "\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627",
+ "\u0628\u0627\u0644\u0625\u0646\u062c\u0644\u064a\u0632\u064a\u0629", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "\u0633\u064a\u062a\u0645", "\u0625\u0637\u0644\u0627\u0642\u0647", "\u0641\u064a", "2008" } );
+ }
+
+ public void testAramaic() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0718\u071d\u0729\u071d\u0726\u0715\u071d\u0710 (\u0710\u0722\u0713\u0720\u071d\u0710: Wikipedia) \u0717\u0718 \u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710 \u071a\u0710\u072a\u072c\u0710 \u0715\u0710\u0722\u071b\u072a\u0722\u071b \u0712\u0720\u072b\u0722\u0308\u0710 \u0723\u0713\u071d\u0710\u0308\u0710\u0702 \u072b\u0721\u0717 \u0710\u072c\u0710 \u0721\u0722 \u0721\u0308\u0720\u072c\u0710 \u0715\"\u0718\u071d\u0729\u071d\" \u0718\"\u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710\"\u0700",
+ new String[] { "\u0718\u071d\u0729\u071d\u0726\u0715\u071d\u0710", "\u0710\u0722\u0713\u0720\u071d\u0710", "Wikipedia", "\u0717\u0718", "\u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710", "\u071a\u0710\u072a\u072c\u0710", "\u0715\u0710\u0722\u071b\u072a\u0722\u071b", "\u0712\u0720\u072b\u0722\u0308\u0710", "\u0723\u0713\u071d\u0710\u0308\u0710", "\u072b\u0721\u0717",
+ "\u0710\u072c\u0710", "\u0721\u0722", "\u0721\u0308\u0720\u072c\u0710", "\u0715", "\u0718\u071d\u0729\u071d", "\u0718", "\u0710\u071d\u0722\u0723\u0729\u0720\u0718\u0726\u0715\u071d\u0710"});
+ }
+
+ public void testBengali() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u098f\u0987 \u09ac\u09bf\u09b6\u09cd\u09ac\u0995\u09cb\u09b7 \u09aa\u09b0\u09bf\u099a\u09be\u09b2\u09a8\u09be \u0995\u09b0\u09c7 \u0989\u0987\u0995\u09bf\u09ae\u09bf\u09a1\u09bf\u09af\u09bc\u09be \u09ab\u09be\u0989\u09a8\u09cd\u09a1\u09c7\u09b6\u09a8 (\u098f\u0995\u099f\u09bf \u0985\u09b2\u09be\u09ad\u099c\u09a8\u0995 \u09b8\u0982\u09b8\u09cd\u09a5\u09be)\u0964 \u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be\u09b0 \u09b6\u09c1\u09b0\u09c1 \u09e7\u09eb \u099c\u09be\u09a8\u09c1\u09af\u09bc\u09be\u09b0\u09bf, \u09e8\u09e6\u09e6\u09e7 \u09b8\u09be\u09b2\u09c7\u0964 \u098f\u0996\u09a8 \u09aa\u09b0\u09cd\u09af\u09a8\u09cd\u09a4 \u09e8\u09e6\u09e6\u099f\u09bf\u09b0\u0993 \u09ac\u09c7\u09b6\u09c0 \u09ad\u09be\u09b7\u09be\u09af\u09bc \u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be \u09b0\u09af\u09bc\u09c7\u099b\u09c7\u0964",
+ new String[] { "\u098f\u0987", "\u09ac\u09bf\u09b6\u09cd\u09ac\u0995\u09cb\u09b7", "\u09aa\u09b0\u09bf\u099a\u09be\u09b2\u09a8\u09be", "\u0995\u09b0\u09c7", "\u0989\u0987\u0995\u09bf\u09ae\u09bf\u09a1\u09bf\u09af\u09bc\u09be", "\u09ab\u09be\u0989\u09a8\u09cd\u09a1\u09c7\u09b6\u09a8", "\u098f\u0995\u099f\u09bf", "\u0985\u09b2\u09be\u09ad\u099c\u09a8\u0995", "\u09b8\u0982\u09b8\u09cd\u09a5\u09be", "\u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be\u09b0",
+ "\u09b6\u09c1\u09b0\u09c1", "\u09e7\u09eb", "\u099c\u09be\u09a8\u09c1\u09af\u09bc\u09be\u09b0\u09bf", "\u09e8\u09e6\u09e6\u09e7", "\u09b8\u09be\u09b2\u09c7", "\u098f\u0996\u09a8", "\u09aa\u09b0\u09cd\u09af\u09a8\u09cd\u09a4", "\u09e8\u09e6\u09e6\u099f\u09bf\u09b0\u0993", "\u09ac\u09c7\u09b6\u09c0", "\u09ad\u09be\u09b7\u09be\u09af\u09bc", "\u0989\u0987\u0995\u09bf\u09aa\u09bf\u09a1\u09bf\u09af\u09bc\u09be", "\u09b0\u09af\u09bc\u09c7\u099b\u09c7" });
+ }
+
+ public void testFarsi() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0648\u06cc\u06a9\u06cc \u067e\u062f\u06cc\u0627\u06cc \u0627\u0646\u06af\u0644\u06cc\u0633\u06cc \u062f\u0631 \u062a\u0627\u0631\u06cc\u062e \u06f2\u06f5 \u062f\u06cc \u06f1\u06f3\u06f7\u06f9 \u0628\u0647 \u0635\u0648\u0631\u062a \u0645\u06a9\u0645\u0644\u06cc \u0628\u0631\u0627\u06cc \u062f\u0627\u0646\u0634\u0646\u0627\u0645\u0647\u0654 \u062a\u062e\u0635\u0635\u06cc \u0646\u0648\u067e\u062f\u06cc\u0627 \u0646\u0648\u0634\u062a\u0647 \u0634\u062f.",
+ new String[] { "\u0648\u06cc\u06a9\u06cc", "\u067e\u062f\u06cc\u0627\u06cc", "\u0627\u0646\u06af\u0644\u06cc\u0633\u06cc", "\u062f\u0631", "\u062a\u0627\u0631\u06cc\u062e", "\u06f2\u06f5", "\u062f\u06cc", "\u06f1\u06f3\u06f7\u06f9", "\u0628\u0647", "\u0635\u0648\u0631\u062a", "\u0645\u06a9\u0645\u0644\u06cc",
+ "\u0628\u0631\u0627\u06cc", "\u062f\u0627\u0646\u0634\u0646\u0627\u0645\u0647\u0654", "\u062a\u062e\u0635\u0635\u06cc", "\u0646\u0648\u067e\u062f\u06cc\u0627", "\u0646\u0648\u0634\u062a\u0647", "\u0634\u062f" });
+ }
+
+ public void testGreek() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0393\u03c1\u03ac\u03c6\u03b5\u03c4\u03b1\u03b9 \u03c3\u03b5 \u03c3\u03c5\u03bd\u03b5\u03c1\u03b3\u03b1\u03c3\u03af\u03b1 \u03b1\u03c0\u03cc \u03b5\u03b8\u03b5\u03bb\u03bf\u03bd\u03c4\u03ad\u03c2 \u03bc\u03b5 \u03c4\u03bf \u03bb\u03bf\u03b3\u03b9\u03c3\u03bc\u03b9\u03ba\u03cc wiki, \u03ba\u03ac\u03c4\u03b9 \u03c0\u03bf\u03c5 \u03c3\u03b7\u03bc\u03b1\u03af\u03bd\u03b5\u03b9 \u03cc\u03c4\u03b9 \u03ac\u03c1\u03b8\u03c1\u03b1 \u03bc\u03c0\u03bf\u03c1\u03b5\u03af \u03bd\u03b1 \u03c0\u03c1\u03bf\u03c3\u03c4\u03b5\u03b8\u03bf\u03cd\u03bd \u03ae \u03bd\u03b1 \u03b1\u03bb\u03bb\u03ac\u03be\u03bf\u03c5\u03bd \u03b1\u03c0\u03cc \u03c4\u03bf\u03bd \u03ba\u03b1\u03b8\u03ad\u03bd\u03b1.",
+ new String[] { "\u0393\u03c1\u03ac\u03c6\u03b5\u03c4\u03b1\u03b9", "\u03c3\u03b5", "\u03c3\u03c5\u03bd\u03b5\u03c1\u03b3\u03b1\u03c3\u03af\u03b1", "\u03b1\u03c0\u03cc", "\u03b5\u03b8\u03b5\u03bb\u03bf\u03bd\u03c4\u03ad\u03c2", "\u03bc\u03b5", "\u03c4\u03bf", "\u03bb\u03bf\u03b3\u03b9\u03c3\u03bc\u03b9\u03ba\u03cc", "wiki", "\u03ba\u03ac\u03c4\u03b9", "\u03c0\u03bf\u03c5",
+ "\u03c3\u03b7\u03bc\u03b1\u03af\u03bd\u03b5\u03b9", "\u03cc\u03c4\u03b9", "\u03ac\u03c1\u03b8\u03c1\u03b1", "\u03bc\u03c0\u03bf\u03c1\u03b5\u03af", "\u03bd\u03b1", "\u03c0\u03c1\u03bf\u03c3\u03c4\u03b5\u03b8\u03bf\u03cd\u03bd", "\u03ae", "\u03bd\u03b1", "\u03b1\u03bb\u03bb\u03ac\u03be\u03bf\u03c5\u03bd", "\u03b1\u03c0\u03cc", "\u03c4\u03bf\u03bd", "\u03ba\u03b1\u03b8\u03ad\u03bd\u03b1" });
+ }
+
+ public void testThai() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0e01\u0e32\u0e23\u0e17\u0e35\u0e48\u0e44\u0e14\u0e49\u0e15\u0e49\u0e2d\u0e07\u0e41\u0e2a\u0e14\u0e07\u0e27\u0e48\u0e32\u0e07\u0e32\u0e19\u0e14\u0e35. \u0e41\u0e25\u0e49\u0e27\u0e40\u0e18\u0e2d\u0e08\u0e30\u0e44\u0e1b\u0e44\u0e2b\u0e19? \u0e51\u0e52\u0e53\u0e54",
+ new String[] { "\u0e01\u0e32\u0e23\u0e17\u0e35\u0e48\u0e44\u0e14\u0e49\u0e15\u0e49\u0e2d\u0e07\u0e41\u0e2a\u0e14\u0e07\u0e27\u0e48\u0e32\u0e07\u0e32\u0e19\u0e14\u0e35", "\u0e41\u0e25\u0e49\u0e27\u0e40\u0e18\u0e2d\u0e08\u0e30\u0e44\u0e1b\u0e44\u0e2b\u0e19", "\u0e51\u0e52\u0e53\u0e54" });
+ }
+
+ public void testLao() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0eaa\u0eb2\u0e97\u0eb2\u0ea5\u0eb0\u0e99\u0eb0\u0ea5\u0eb1\u0e94 \u0e9b\u0eb0\u0e8a\u0eb2\u0e97\u0eb4\u0e9b\u0eb0\u0ec4\u0e95 \u0e9b\u0eb0\u0e8a\u0eb2\u0e8a\u0ebb\u0e99\u0ea5\u0eb2\u0ea7",
+ new String[] { "\u0eaa\u0eb2\u0e97\u0eb2\u0ea5\u0eb0\u0e99\u0eb0\u0ea5\u0eb1\u0e94", "\u0e9b\u0eb0\u0e8a\u0eb2\u0e97\u0eb4\u0e9b\u0eb0\u0ec4\u0e95", "\u0e9b\u0eb0\u0e8a\u0eb2\u0e8a\u0ebb\u0e99\u0ea5\u0eb2\u0ea7" });
+ }
+
+ public void testTibetan() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u0f66\u0fa3\u0f7c\u0f53\u0f0b\u0f58\u0f5b\u0f7c\u0f51\u0f0b\u0f51\u0f44\u0f0b\u0f63\u0f66\u0f0b\u0f60\u0f51\u0f72\u0f66\u0f0b\u0f56\u0f7c\u0f51\u0f0b\u0f61\u0f72\u0f42\u0f0b\u0f58\u0f72\u0f0b\u0f49\u0f58\u0f66\u0f0b\u0f42\u0f7c\u0f44\u0f0b\u0f60\u0f55\u0f7a\u0f63\u0f0b\u0f51\u0f74\u0f0b\u0f42\u0f4f\u0f7c\u0f44\u0f0b\u0f56\u0f62\u0f0b\u0f67\u0f0b\u0f45\u0f44\u0f0b\u0f51\u0f42\u0f7a\u0f0b\u0f58\u0f5a\u0f53\u0f0b\u0f58\u0f46\u0f72\u0f66\u0f0b\u0f66\u0f7c\u0f0d \u0f0d",
+ new String[] { "\u0f66\u0fa3\u0f7c\u0f53", "\u0f58\u0f5b\u0f7c\u0f51", "\u0f51\u0f44", "\u0f63\u0f66", "\u0f60\u0f51\u0f72\u0f66", "\u0f56\u0f7c\u0f51", "\u0f61\u0f72\u0f42",
+ "\u0f58\u0f72", "\u0f49\u0f58\u0f66", "\u0f42\u0f7c\u0f44", "\u0f60\u0f55\u0f7a\u0f63", "\u0f51\u0f74", "\u0f42\u0f4f\u0f7c\u0f44", "\u0f56\u0f62",
+ "\u0f67", "\u0f45\u0f44", "\u0f51\u0f42\u0f7a", "\u0f58\u0f5a\u0f53", "\u0f58\u0f46\u0f72\u0f66", "\u0f66\u0f7c" });
+ }
+
+ /*
+ * For chinese, tokenize as char (these can later form bigrams or whatever)
+ */
+ public void testChinese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u6211\u662f\u4e2d\u56fd\u4eba\u3002 \uff11\uff12\uff13\uff14 \uff34\uff45\uff53\uff54\uff53 ",
+ new String[] { "\u6211", "\u662f", "\u4e2d", "\u56fd", "\u4eba", "\uff11\uff12\uff13\uff14", "\uff34\uff45\uff53\uff54\uff53"});
+ }
+
+ public void testEmpty() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
+ }
+
+ /* test various jira issues this analyzer is related to */
+
+ public void testLUCENE1545() throws Exception {
+ /*
+ * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
+ * The word "mo\u0364chte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
+ * Expected result is only one token "mo\u0364chte".
+ */
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "mo\u0364chte", new String[] { "mo\u0364chte" });
+ }
+
+ /* Tests from StandardAnalyzer, just to show behavior is similar */
+ public void testAlphanumericSA() throws Exception {
+ // alphanumeric tokens
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
+ }
+
+ public void testDelimitersSA() throws Exception {
+ // other delimiters: "-", "/", ","
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+ }
+
+ public void testApostrophesSA() throws Exception {
+ // internal apostrophes: O'Reilly, you're, O'Reilly's
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+ }
+
+ public void testNumericSA() throws Exception {
+ // floating point, serial, model numbers, ip addresses, etc.
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+ }
+
+ public void testTextWithNumbersSA() throws Exception {
+ // numbers
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+ }
+
+ public void testVariousTextSA() throws Exception {
+ // various
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+ }
+
+ public void testKoreanSA() throws Exception {
+ // Korean words
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\uc548\ub155\ud558\uc138\uc694 \ud55c\uae00\uc785\ub2c8\ub2e4", new String[]{"\uc548\ub155\ud558\uc138\uc694", "\ud55c\uae00\uc785\ub2c8\ub2e4"});
+ }
+
+ public void testOffsets() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
+ new String[] {"David", "has", "5000", "bones"},
+ new int[] {0, 6, 10, 15},
+ new int[] {5, 9, 14, 20});
+ }
+
+ public void testTypes() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
+ new String[] {"David", "has", "5000", "bones"},
+ new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
+ }
+
+ public void testUnicodeWordBreaks() throws Exception {
+ WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
+ wordBreakTest.test(a);
+ }
+
+ public void testSupplementary() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\U00029b05\u8271\u935f\u41f9\u612f\u701b",
+ new String[] {"\U00029b05", "\u8271", "\u935f", "\u41f9", "\u612f", "\u701b"},
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ }
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\ud6c8\ubbfc\uc815\uc74c",
+ new String[] { "\ud6c8\ubbfc\uc815\uc74c" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u4eee\u540d\u9063\u3044 \u30ab\u30bf\u30ab\u30ca",
+ new String[] { "\u4eee", "\u540d", "\u9063", "\u3044", "\u30ab\u30bf\u30ab\u30ca" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
+
+ public void testCombiningMarks() throws Exception {
+ checkOneTerm(a, "\u3055\u3099", "\u3055\u3099"); // hiragana
+ checkOneTerm(a, "\u30b5\u3099", "\u30b5\u3099"); // katakana
+ checkOneTerm(a, "\u58f9\u3099", "\u58f9\u3099"); // ideographic
+ checkOneTerm(a, "\uc544\u3099", "\uc544\u3099"); // hangul
+ }
+
+ /**
+ * Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet},
+ * and/or \p{WB:MidNum} should trigger a token split.
+ */
+ public void testMid() throws Exception {
+ // ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
+
+ // '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
+
+ // ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
+
+ // Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
+
+ // Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
+
+ // '_' is in \p{WB:ExtendNumLet}
+
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A:B", new String[] { "A:B_A:B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A::B", new String[] { "A:B_A", "B" });
+
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1.2", new String[] { "1.2_1.2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A.B", new String[] { "A.B_A.B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1..2", new String[] { "1.2_1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A..B", new String[] { "A.B_A", "B" });
+
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,2", new String[] { "1,2_1,2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,,2", new String[] { "1,2_1", "2" });
+
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A.:B", new String[] { "C_A", "B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A:.B", new String[] { "C_A", "B" });
+
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1,.2", new String[] { "3_1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
+ }
+
+
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ Analyzer analyzer = new StandardAnalyzer();
+ checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+ analyzer.close();
+ }
+
+ /** blast some random large strings through the analyzer */
+ public void testRandomHugeStrings() throws Exception {
+ Analyzer analyzer = new StandardAnalyzer();
+ checkRandomData(random(), analyzer, 100*RANDOM_MULTIPLIER, 8192);
+ analyzer.close();
+ }
+
+ // Adds random graph after:
+ public void testRandomHugeStringsGraphAfter() throws Exception {
+ Random random = random();
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
+ TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
+ return new TokenStreamComponents(tokenizer, tokenStream);
+ }
+ };
+ checkRandomData(random, analyzer, 100*RANDOM_MULTIPLIER, 8192);
+ analyzer.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
index d7aa2bb..736d15d 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
@@ -18,14 +18,14 @@ package org.apache.lucene.search.suggest.analyzing;
import java.io.IOException;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/** Like {@link StopFilter} except it will not remove the
* last token if that token was not followed by some token
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilterFactory.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilterFactory.java
index 32baf08..3e222bc 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilterFactory.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilterFactory.java
@@ -16,16 +16,16 @@
*/
package org.apache.lucene.search.suggest.analyzing;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader; // jdocs
import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import org.apache.lucene.analysis.util.WordlistLoader; // jdocs
-
-import java.util.Map;
-import java.io.IOException;
/**
* Factory for {@link SuggestStopFilter}.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
index d0d3a41..69d3ed6 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
@@ -28,13 +28,13 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.suggest.Input;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java
index c2b2bed..fe14e23 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java
@@ -23,10 +23,10 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
index b26b5332..3e89275 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
@@ -32,16 +32,16 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
-import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
import org.apache.lucene.search.suggest.InputIterator;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java
index 44917d2..5ed84e0 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java
@@ -19,11 +19,11 @@ package org.apache.lucene.search.suggest.analyzing;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
public class TestSuggestStopFilter extends BaseTokenStreamTestCase {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilterFactory.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilterFactory.java
index 58b1892..69947e4 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilterFactory.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilterFactory.java
@@ -21,8 +21,8 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.core.StopAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
[10/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package-info.java
deleted file mode 100644
index afc68ce..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package-info.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Fast, general-purpose grammar-based tokenizers.
- * <p>The <code>org.apache.lucene.analysis.standard</code> package contains three
- * fast grammar-based tokenizers constructed with JFlex:</p>
- * <ul>
- * <li>{@link org.apache.lucene.analysis.standard.StandardTokenizer}:
- * as of Lucene 3.1, implements the Word Break rules from the Unicode Text
- * Segmentation algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
- * <b>not</b> tokenized as single tokens, but are instead split up into
- * tokens according to the UAX#29 word break rules.
- * <br>
- * {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} includes
- * {@link org.apache.lucene.analysis.standard.StandardTokenizer StandardTokenizer},
- * {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
- * {@link org.apache.lucene.analysis.core.LowerCaseFilter LowerCaseFilter}
- * and {@link org.apache.lucene.analysis.core.StopFilter StopFilter}.
- * When the <code>Version</code> specified in the constructor is lower than
- * 3.1, the {@link org.apache.lucene.analysis.standard.ClassicTokenizer ClassicTokenizer}
- * implementation is invoked.</li>
- * <li>{@link org.apache.lucene.analysis.standard.ClassicTokenizer ClassicTokenizer}:
- * this class was formerly (prior to Lucene 3.1) named
- * <code>StandardTokenizer</code>. (Its tokenization rules are not
- * based on the Unicode Text Segmentation algorithm.)
- * {@link org.apache.lucene.analysis.standard.ClassicAnalyzer ClassicAnalyzer} includes
- * {@link org.apache.lucene.analysis.standard.ClassicTokenizer ClassicTokenizer},
- * {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
- * {@link org.apache.lucene.analysis.core.LowerCaseFilter LowerCaseFilter}
- * and {@link org.apache.lucene.analysis.core.StopFilter StopFilter}.
- * </li>
- * <li>{@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer UAX29URLEmailTokenizer}:
- * implements the Word Break rules from the Unicode Text Segmentation
- * algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * URLs and email addresses are also tokenized according to the relevant RFCs.
- * <br>
- * {@link org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer UAX29URLEmailAnalyzer} includes
- * {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer UAX29URLEmailTokenizer},
- * {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
- * {@link org.apache.lucene.analysis.core.LowerCaseFilter LowerCaseFilter}
- * and {@link org.apache.lucene.analysis.core.StopFilter StopFilter}.
- * </li>
- * </ul>
- */
-package org.apache.lucene.analysis.standard;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
new file mode 100644
index 0000000..055d0b2
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
@@ -0,0 +1,50 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in spatial/ -->
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+ Fast, general-purpose grammar-based tokenizers.
+ <ul>
+ <li>{@link org.apache.lucene.analysis.standard.ClassicTokenizer ClassicTokenizer}:
+ this class was formerly (prior to Lucene 3.1) named
+ <code>StandardTokenizer</code>. (Its tokenization rules are not
+ based on the Unicode Text Segmentation algorithm.)
+ {@link org.apache.lucene.analysis.standard.ClassicAnalyzer ClassicAnalyzer} includes
+ {@link org.apache.lucene.analysis.standard.ClassicTokenizer ClassicTokenizer},
+ {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
+ {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
+ and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ </li>
+ <li>{@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer UAX29URLEmailTokenizer}:
+ implements the Word Break rules from the Unicode Text Segmentation
+ algorithm, as specified in
+ <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>, except
+ URLs and email addresses are also tokenized according to the relevant RFCs.
+ <br>
+ {@link org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer UAX29URLEmailAnalyzer} includes
+ {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer UAX29URLEmailTokenizer},
+ {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
+ {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
+ and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ </li>
+ </ul>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
index fd15bbd..fd2aa2e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.tartarus.snowball.ext.SwedishStemmer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
index 2488665..8bab9a7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
@@ -30,9 +30,9 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
index 3f2e52a..9543c5c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@@ -20,13 +20,13 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
index c9ed471..a21495f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
@@ -21,15 +21,15 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.TurkishStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
index 8ee809c..f8de8a7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
@@ -37,7 +37,9 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
deleted file mode 100644
index e414366..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
+++ /dev/null
@@ -1,669 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.util.Arrays;
-import java.util.AbstractMap;
-import java.util.AbstractSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.analysis.util.CharacterUtils;
-
-/**
- * A simple class that stores key Strings as char[]'s in a
- * hash table. Note that this is not a general purpose
- * class. For example, it cannot remove items from the
- * map, nor does it resize its hash table to be smaller,
- * etc. It is designed to be quick to retrieve items
- * by char[] keys without the necessity of converting
- * to a String first.
- */
-public class CharArrayMap<V> extends AbstractMap<Object,V> {
- // private only because missing generics
- private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
-
- private final static int INIT_SIZE = 8;
- private boolean ignoreCase;
- private int count;
- char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
- V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
-
- /**
- * Create map with enough capacity to hold startSize terms
- *
- * @param startSize
- * the initial capacity
- * @param ignoreCase
- * <code>false</code> if and only if the set should be case sensitive
- * otherwise <code>true</code>.
- */
- @SuppressWarnings("unchecked")
- public CharArrayMap(int startSize, boolean ignoreCase) {
- this.ignoreCase = ignoreCase;
- int size = INIT_SIZE;
- while(startSize + (startSize>>2) > size)
- size <<= 1;
- keys = new char[size][];
- values = (V[]) new Object[size];
- }
-
- /**
- * Creates a map from the mappings in another map.
- *
- * @param c
- * a map whose mappings to be copied
- * @param ignoreCase
- * <code>false</code> if and only if the set should be case sensitive
- * otherwise <code>true</code>.
- */
- public CharArrayMap(Map<?,? extends V> c, boolean ignoreCase) {
- this(c.size(), ignoreCase);
- putAll(c);
- }
-
- /** Create set from the supplied map (used internally for readonly maps...) */
- private CharArrayMap(CharArrayMap<V> toCopy){
- this.keys = toCopy.keys;
- this.values = toCopy.values;
- this.ignoreCase = toCopy.ignoreCase;
- this.count = toCopy.count;
- }
-
- /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
- @Override
- public void clear() {
- count = 0;
- Arrays.fill(keys, null);
- Arrays.fill(values, null);
- }
-
- /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
- * are in the {@link #keySet()} */
- public boolean containsKey(char[] text, int off, int len) {
- return keys[getSlot(text, off, len)] != null;
- }
-
- /** true if the <code>CharSequence</code> is in the {@link #keySet()} */
- public boolean containsKey(CharSequence cs) {
- return keys[getSlot(cs)] != null;
- }
-
- @Override
- public boolean containsKey(Object o) {
- if (o instanceof char[]) {
- final char[] text = (char[])o;
- return containsKey(text, 0, text.length);
- }
- return containsKey(o.toString());
- }
-
- /** returns the value of the mapping of <code>len</code> chars of <code>text</code>
- * starting at <code>off</code> */
- public V get(char[] text, int off, int len) {
- return values[getSlot(text, off, len)];
- }
-
- /** returns the value of the mapping of the chars inside this {@code CharSequence} */
- public V get(CharSequence cs) {
- return values[getSlot(cs)];
- }
-
- @Override
- public V get(Object o) {
- if (o instanceof char[]) {
- final char[] text = (char[])o;
- return get(text, 0, text.length);
- }
- return get(o.toString());
- }
-
- private int getSlot(char[] text, int off, int len) {
- int code = getHashCode(text, off, len);
- int pos = code & (keys.length-1);
- char[] text2 = keys[pos];
- if (text2 != null && !equals(text, off, len, text2)) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- pos = code & (keys.length-1);
- text2 = keys[pos];
- } while (text2 != null && !equals(text, off, len, text2));
- }
- return pos;
- }
-
- /** Returns true if the String is in the set */
- private int getSlot(CharSequence text) {
- int code = getHashCode(text);
- int pos = code & (keys.length-1);
- char[] text2 = keys[pos];
- if (text2 != null && !equals(text, text2)) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- pos = code & (keys.length-1);
- text2 = keys[pos];
- } while (text2 != null && !equals(text, text2));
- }
- return pos;
- }
-
- /** Add the given mapping. */
- public V put(CharSequence text, V value) {
- return put(text.toString(), value); // could be more efficient
- }
-
- @Override
- public V put(Object o, V value) {
- if (o instanceof char[]) {
- return put((char[])o, value);
- }
- return put(o.toString(), value);
- }
-
- /** Add the given mapping. */
- public V put(String text, V value) {
- return put(text.toCharArray(), value);
- }
-
- /** Add the given mapping.
- * If ignoreCase is true for this Set, the text array will be directly modified.
- * The user should never modify this text array after calling this method.
- */
- public V put(char[] text, V value) {
- if (ignoreCase) {
- CharacterUtils.toLowerCase(text, 0, text.length);
- }
- int slot = getSlot(text, 0, text.length);
- if (keys[slot] != null) {
- final V oldValue = values[slot];
- values[slot] = value;
- return oldValue;
- }
- keys[slot] = text;
- values[slot] = value;
- count++;
-
- if (count + (count>>2) > keys.length) {
- rehash();
- }
-
- return null;
- }
-
- @SuppressWarnings("unchecked")
- private void rehash() {
- assert keys.length == values.length;
- final int newSize = 2*keys.length;
- final char[][] oldkeys = keys;
- final V[] oldvalues = values;
- keys = new char[newSize][];
- values = (V[]) new Object[newSize];
-
- for(int i=0; i<oldkeys.length; i++) {
- char[] text = oldkeys[i];
- if (text != null) {
- // todo: could be faster... no need to compare strings on collision
- final int slot = getSlot(text,0,text.length);
- keys[slot] = text;
- values[slot] = oldvalues[i];
- }
- }
- }
-
- private boolean equals(char[] text1, int off, int len, char[] text2) {
- if (len != text2.length)
- return false;
- final int limit = off+len;
- if (ignoreCase) {
- for(int i=0;i<len;) {
- final int codePointAt = Character.codePointAt(text1, off+i, limit);
- if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
- return false;
- i += Character.charCount(codePointAt);
- }
- } else {
- for(int i=0;i<len;i++) {
- if (text1[off+i] != text2[i])
- return false;
- }
- }
- return true;
- }
-
- private boolean equals(CharSequence text1, char[] text2) {
- int len = text1.length();
- if (len != text2.length)
- return false;
- if (ignoreCase) {
- for(int i=0;i<len;) {
- final int codePointAt = Character.codePointAt(text1, i);
- if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
- return false;
- i += Character.charCount(codePointAt);
- }
- } else {
- for(int i=0;i<len;i++) {
- if (text1.charAt(i) != text2[i])
- return false;
- }
- }
- return true;
- }
-
- private int getHashCode(char[] text, int offset, int len) {
- if (text == null)
- throw new NullPointerException();
- int code = 0;
- final int stop = offset + len;
- if (ignoreCase) {
- for (int i=offset; i<stop;) {
- final int codePointAt = Character.codePointAt(text, i, stop);
- code = code*31 + Character.toLowerCase(codePointAt);
- i += Character.charCount(codePointAt);
- }
- } else {
- for (int i=offset; i<stop; i++) {
- code = code*31 + text[i];
- }
- }
- return code;
- }
-
- private int getHashCode(CharSequence text) {
- if (text == null)
- throw new NullPointerException();
- int code = 0;
- int len = text.length();
- if (ignoreCase) {
- for (int i=0; i<len;) {
- int codePointAt = Character.codePointAt(text, i);
- code = code*31 + Character.toLowerCase(codePointAt);
- i += Character.charCount(codePointAt);
- }
- } else {
- for (int i=0; i<len; i++) {
- code = code*31 + text.charAt(i);
- }
- }
- return code;
- }
-
- @Override
- public V remove(Object key) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int size() {
- return count;
- }
-
- @Override
- public String toString() {
- final StringBuilder sb = new StringBuilder("{");
- for (Map.Entry<Object,V> entry : entrySet()) {
- if (sb.length()>1) sb.append(", ");
- sb.append(entry);
- }
- return sb.append('}').toString();
- }
-
- private EntrySet entrySet = null;
- private CharArraySet keySet = null;
-
- EntrySet createEntrySet() {
- return new EntrySet(true);
- }
-
- @Override
- public final EntrySet entrySet() {
- if (entrySet == null) {
- entrySet = createEntrySet();
- }
- return entrySet;
- }
-
- // helper for CharArraySet to not produce endless recursion
- final Set<Object> originalKeySet() {
- return super.keySet();
- }
-
- /** Returns an {@link CharArraySet} view on the map's keys.
- * The set will use the same {@code matchVersion} as this map. */
- @Override @SuppressWarnings({"unchecked","rawtypes"})
- public final CharArraySet keySet() {
- if (keySet == null) {
- // prevent adding of entries
- keySet = new CharArraySet((CharArrayMap) this) {
- @Override
- public boolean add(Object o) {
- throw new UnsupportedOperationException();
- }
- @Override
- public boolean add(CharSequence text) {
- throw new UnsupportedOperationException();
- }
- @Override
- public boolean add(String text) {
- throw new UnsupportedOperationException();
- }
- @Override
- public boolean add(char[] text) {
- throw new UnsupportedOperationException();
- }
- };
- }
- return keySet;
- }
-
- /** public iterator class so efficient methods are exposed to users */
- public class EntryIterator implements Iterator<Map.Entry<Object,V>> {
- private int pos=-1;
- private int lastPos;
- private final boolean allowModify;
-
- private EntryIterator(boolean allowModify) {
- this.allowModify = allowModify;
- goNext();
- }
-
- private void goNext() {
- lastPos = pos;
- pos++;
- while (pos < keys.length && keys[pos] == null) pos++;
- }
-
- @Override
- public boolean hasNext() {
- return pos < keys.length;
- }
-
- /** gets the next key... do not modify the returned char[] */
- public char[] nextKey() {
- goNext();
- return keys[lastPos];
- }
-
- /** gets the next key as a newly created String object */
- public String nextKeyString() {
- return new String(nextKey());
- }
-
- /** returns the value associated with the last key returned */
- public V currentValue() {
- return values[lastPos];
- }
-
- /** sets the value associated with the last key returned */
- public V setValue(V value) {
- if (!allowModify)
- throw new UnsupportedOperationException();
- V old = values[lastPos];
- values[lastPos] = value;
- return old;
- }
-
- /** use nextCharArray() + currentValue() for better efficiency. */
- @Override
- public Map.Entry<Object,V> next() {
- goNext();
- return new MapEntry(lastPos, allowModify);
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- private final class MapEntry implements Map.Entry<Object,V> {
- private final int pos;
- private final boolean allowModify;
-
- private MapEntry(int pos, boolean allowModify) {
- this.pos = pos;
- this.allowModify = allowModify;
- }
-
- @Override
- public Object getKey() {
- // we must clone here, as putAll to another CharArrayMap
- // with other case sensitivity flag would corrupt the keys
- return keys[pos].clone();
- }
-
- @Override
- public V getValue() {
- return values[pos];
- }
-
- @Override
- public V setValue(V value) {
- if (!allowModify)
- throw new UnsupportedOperationException();
- final V old = values[pos];
- values[pos] = value;
- return old;
- }
-
- @Override
- public String toString() {
- return new StringBuilder().append(keys[pos]).append('=')
- .append((values[pos] == CharArrayMap.this) ? "(this Map)" : values[pos])
- .toString();
- }
- }
-
- /** public EntrySet class so efficient methods are exposed to users */
- public final class EntrySet extends AbstractSet<Map.Entry<Object,V>> {
- private final boolean allowModify;
-
- private EntrySet(boolean allowModify) {
- this.allowModify = allowModify;
- }
-
- @Override
- public EntryIterator iterator() {
- return new EntryIterator(allowModify);
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public boolean contains(Object o) {
- if (!(o instanceof Map.Entry))
- return false;
- final Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
- final Object key = e.getKey();
- final Object val = e.getValue();
- final Object v = get(key);
- return v == null ? val == null : v.equals(val);
- }
-
- @Override
- public boolean remove(Object o) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int size() {
- return count;
- }
-
- @Override
- public void clear() {
- if (!allowModify)
- throw new UnsupportedOperationException();
- CharArrayMap.this.clear();
- }
- }
-
- /**
- * Returns an unmodifiable {@link CharArrayMap}. This allows to provide
- * unmodifiable views of internal map for "read-only" use.
- *
- * @param map
- * a map for which the unmodifiable map is returned.
- * @return an new unmodifiable {@link CharArrayMap}.
- * @throws NullPointerException
- * if the given map is <code>null</code>.
- */
- public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
- if (map == null)
- throw new NullPointerException("Given map is null");
- if (map == emptyMap() || map.isEmpty())
- return emptyMap();
- if (map instanceof UnmodifiableCharArrayMap)
- return map;
- return new UnmodifiableCharArrayMap<>(map);
- }
-
- /**
- * Returns a copy of the given map as a {@link CharArrayMap}. If the given map
- * is a {@link CharArrayMap} the ignoreCase property will be preserved.
- *
- * @param map
- * a map to copy
- * @return a copy of the given map as a {@link CharArrayMap}. If the given map
- * is a {@link CharArrayMap} the ignoreCase property as well as the
- * matchVersion will be of the given map will be preserved.
- */
- @SuppressWarnings("unchecked")
- public static <V> CharArrayMap<V> copy(final Map<?,? extends V> map) {
- if(map == EMPTY_MAP)
- return emptyMap();
- if(map instanceof CharArrayMap) {
- CharArrayMap<V> m = (CharArrayMap<V>) map;
- // use fast path instead of iterating all values
- // this is even on very small sets ~10 times faster than iterating
- final char[][] keys = new char[m.keys.length][];
- System.arraycopy(m.keys, 0, keys, 0, keys.length);
- final V[] values = (V[]) new Object[m.values.length];
- System.arraycopy(m.values, 0, values, 0, values.length);
- m = new CharArrayMap<>(m);
- m.keys = keys;
- m.values = values;
- return m;
- }
- return new CharArrayMap<>(map, false);
- }
-
- /** Returns an empty, unmodifiable map. */
- @SuppressWarnings("unchecked")
- public static <V> CharArrayMap<V> emptyMap() {
- return (CharArrayMap<V>) EMPTY_MAP;
- }
-
- // package private CharArraySet instanceof check in CharArraySet
- static class UnmodifiableCharArrayMap<V> extends CharArrayMap<V> {
-
- UnmodifiableCharArrayMap(CharArrayMap<V> map) {
- super(map);
- }
-
- @Override
- public void clear() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public V put(Object o, V val){
- throw new UnsupportedOperationException();
- }
-
- @Override
- public V put(char[] text, V val) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public V put(CharSequence text, V val) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public V put(String text, V val) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public V remove(Object key) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- EntrySet createEntrySet() {
- return new EntrySet(false);
- }
- }
-
- /**
- * Empty {@link org.apache.lucene.analysis.util.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
- * Contains checks will always return <code>false</code> or throw
- * NPE if necessary.
- */
- private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> {
- EmptyCharArrayMap() {
- super(new CharArrayMap<V>(0, false));
- }
-
- @Override
- public boolean containsKey(char[] text, int off, int len) {
- if(text == null)
- throw new NullPointerException();
- return false;
- }
-
- @Override
- public boolean containsKey(CharSequence cs) {
- if(cs == null)
- throw new NullPointerException();
- return false;
- }
-
- @Override
- public boolean containsKey(Object o) {
- if(o == null)
- throw new NullPointerException();
- return false;
- }
-
- @Override
- public V get(char[] text, int off, int len) {
- if(text == null)
- throw new NullPointerException();
- return null;
- }
-
- @Override
- public V get(CharSequence cs) {
- if(cs == null)
- throw new NullPointerException();
- return null;
- }
-
- @Override
- public V get(Object o) {
- if(o == null)
- throw new NullPointerException();
- return null;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArraySet.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArraySet.java
deleted file mode 100644
index 15485bc..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArraySet.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.util.AbstractSet;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.Set;
-
-/**
- * A simple class that stores Strings as char[]'s in a
- * hash table. Note that this is not a general purpose
- * class. For example, it cannot remove items from the
- * set, nor does it resize its hash table to be smaller,
- * etc. It is designed to be quick to test if a char[]
- * is in the set without the necessity of converting it
- * to a String first.
- *
- * <P>
- * <em>Please note:</em> This class implements {@link java.util.Set Set} but
- * does not behave like it should in all cases. The generic type is
- * {@code Set<Object>}, because you can add any object to it,
- * that has a string representation. The add methods will use
- * {@link Object#toString} and store the result using a {@code char[]}
- * buffer. The same behavior have the {@code contains()} methods.
- * The {@link #iterator()} returns an {@code Iterator<char[]>}.
- */
-public class CharArraySet extends AbstractSet<Object> {
- public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
- private static final Object PLACEHOLDER = new Object();
-
- private final CharArrayMap<Object> map;
-
- /**
- * Create set with enough capacity to hold startSize terms
- *
- * @param startSize
- * the initial capacity
- * @param ignoreCase
- * <code>false</code> if and only if the set should be case sensitive
- * otherwise <code>true</code>.
- */
- public CharArraySet(int startSize, boolean ignoreCase) {
- this(new CharArrayMap<>(startSize, ignoreCase));
- }
-
- /**
- * Creates a set from a Collection of objects.
- *
- * @param c
- * a collection whose elements to be placed into the set
- * @param ignoreCase
- * <code>false</code> if and only if the set should be case sensitive
- * otherwise <code>true</code>.
- */
- public CharArraySet(Collection<?> c, boolean ignoreCase) {
- this(c.size(), ignoreCase);
- addAll(c);
- }
-
- /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
- CharArraySet(final CharArrayMap<Object> map){
- this.map = map;
- }
-
- /** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
- @Override
- public void clear() {
- map.clear();
- }
-
- /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
- * are in the set */
- public boolean contains(char[] text, int off, int len) {
- return map.containsKey(text, off, len);
- }
-
- /** true if the <code>CharSequence</code> is in the set */
- public boolean contains(CharSequence cs) {
- return map.containsKey(cs);
- }
-
- @Override
- public boolean contains(Object o) {
- return map.containsKey(o);
- }
-
- @Override
- public boolean add(Object o) {
- return map.put(o, PLACEHOLDER) == null;
- }
-
- /** Add this CharSequence into the set */
- public boolean add(CharSequence text) {
- return map.put(text, PLACEHOLDER) == null;
- }
-
- /** Add this String into the set */
- public boolean add(String text) {
- return map.put(text, PLACEHOLDER) == null;
- }
-
- /** Add this char[] directly to the set.
- * If ignoreCase is true for this Set, the text array will be directly modified.
- * The user should never modify this text array after calling this method.
- */
- public boolean add(char[] text) {
- return map.put(text, PLACEHOLDER) == null;
- }
-
- @Override
- public int size() {
- return map.size();
- }
-
- /**
- * Returns an unmodifiable {@link CharArraySet}. This allows to provide
- * unmodifiable views of internal sets for "read-only" use.
- *
- * @param set
- * a set for which the unmodifiable set is returned.
- * @return an new unmodifiable {@link CharArraySet}.
- * @throws NullPointerException
- * if the given set is <code>null</code>.
- */
- public static CharArraySet unmodifiableSet(CharArraySet set) {
- if (set == null)
- throw new NullPointerException("Given set is null");
- if (set == EMPTY_SET)
- return EMPTY_SET;
- if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
- return set;
- return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
- }
-
- /**
- * Returns a copy of the given set as a {@link CharArraySet}. If the given set
- * is a {@link CharArraySet} the ignoreCase property will be preserved.
- *
- * @param set
- * a set to copy
- * @return a copy of the given set as a {@link CharArraySet}. If the given set
- * is a {@link CharArraySet} the ignoreCase property as well as the
- * matchVersion will be of the given set will be preserved.
- */
- public static CharArraySet copy(final Set<?> set) {
- if(set == EMPTY_SET)
- return EMPTY_SET;
- if(set instanceof CharArraySet) {
- final CharArraySet source = (CharArraySet) set;
- return new CharArraySet(CharArrayMap.copy(source.map));
- }
- return new CharArraySet(set, false);
- }
-
- /**
- * Returns an {@link Iterator} for {@code char[]} instances in this set.
- */
- @Override @SuppressWarnings("unchecked")
- public Iterator<Object> iterator() {
- // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
- return map.originalKeySet().iterator();
- }
-
- @Override
- public String toString() {
- final StringBuilder sb = new StringBuilder("[");
- for (Object item : this) {
- if (sb.length()>1) sb.append(", ");
- if (item instanceof char[]) {
- sb.append((char[]) item);
- } else {
- sb.append(item);
- }
- }
- return sb.append(']').toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
index 4952f99..9100345 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
@@ -22,16 +22,16 @@ import java.util.Objects;
import java.util.function.IntPredicate;
import java.util.function.IntUnaryOperator;
+import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.analysis.CharacterUtils;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.analysis.util.CharacterUtils;
-import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
/**
* An abstract base class for simple, character-oriented tokenizers.
@@ -285,4 +285,4 @@ public abstract class CharTokenizer extends Tokenizer {
finalOffset = 0;
ioBuffer.reset(); // make sure to reset the IO buffer!!
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
deleted file mode 100644
index b728523..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.io.IOException;
-import java.io.Reader;
-
-/**
- * Utility class to write tokenizers or token filters.
- * @lucene.internal
- */
-public final class CharacterUtils {
-
- private CharacterUtils() {} // no instantiation
-
- /**
- * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
- * of the given bufferSize.
- *
- * @param bufferSize
- * the internal char buffer size, must be <code>>= 2</code>
- * @return a new {@link CharacterBuffer} instance.
- */
- public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
- if (bufferSize < 2) {
- throw new IllegalArgumentException("buffersize must be >= 2");
- }
- return new CharacterBuffer(new char[bufferSize], 0, 0);
- }
-
-
- /**
- * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
- * at the given offset.
- * @param buffer the char buffer to lowercase
- * @param offset the offset to start at
- * @param limit the max char in the buffer to lower case
- */
- public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
- assert buffer.length >= limit;
- assert offset <=0 && offset <= buffer.length;
- for (int i = offset; i < limit;) {
- i += Character.toChars(
- Character.toLowerCase(
- Character.codePointAt(buffer, i, limit)), buffer, i);
- }
- }
-
- /**
- * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting
- * at the given offset.
- * @param buffer the char buffer to UPPERCASE
- * @param offset the offset to start at
- * @param limit the max char in the buffer to lower case
- */
- public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
- assert buffer.length >= limit;
- assert offset <=0 && offset <= buffer.length;
- for (int i = offset; i < limit;) {
- i += Character.toChars(
- Character.toUpperCase(
- Character.codePointAt(buffer, i, limit)), buffer, i);
- }
- }
-
- /** Converts a sequence of Java characters to a sequence of unicode code points.
- * @return the number of code points written to the destination buffer */
- public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
- if (srcLen < 0) {
- throw new IllegalArgumentException("srcLen must be >= 0");
- }
- int codePointCount = 0;
- for (int i = 0; i < srcLen; ) {
- final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
- final int charCount = Character.charCount(cp);
- dest[destOff + codePointCount++] = cp;
- i += charCount;
- }
- return codePointCount;
- }
-
- /** Converts a sequence of unicode code points to a sequence of Java characters.
- * @return the number of chars written to the destination buffer */
- public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
- if (srcLen < 0) {
- throw new IllegalArgumentException("srcLen must be >= 0");
- }
- int written = 0;
- for (int i = 0; i < srcLen; ++i) {
- written += Character.toChars(src[srcOff + i], dest, destOff + written);
- }
- return written;
- }
-
- /**
- * Fills the {@link CharacterBuffer} with characters read from the given
- * reader {@link Reader}. This method tries to read <code>numChars</code>
- * characters into the {@link CharacterBuffer}, each call to fill will start
- * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
- * In case code points can span across 2 java characters, this method may
- * only fill <code>numChars - 1</code> characters in order not to split in
- * the middle of a surrogate pair, even if there are remaining characters in
- * the {@link Reader}.
- * <p>
- * This method guarantees
- * that the given {@link CharacterBuffer} will never contain a high surrogate
- * character as the last element in the buffer unless it is the last available
- * character in the reader. In other words, high and low surrogate pairs will
- * always be preserved across buffer boarders.
- * </p>
- * <p>
- * A return value of <code>false</code> means that this method call exhausted
- * the reader, but there may be some bytes which have been read, which can be
- * verified by checking whether <code>buffer.getLength() > 0</code>.
- * </p>
- *
- * @param buffer
- * the buffer to fill.
- * @param reader
- * the reader to read characters from.
- * @param numChars
- * the number of chars to read
- * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
- * @throws IOException
- * if the reader throws an {@link IOException}.
- */
- public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
- assert buffer.buffer.length >= 2;
- if (numChars < 2 || numChars > buffer.buffer.length) {
- throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
- }
- final char[] charBuffer = buffer.buffer;
- buffer.offset = 0;
- final int offset;
-
- // Install the previously saved ending high surrogate:
- if (buffer.lastTrailingHighSurrogate != 0) {
- charBuffer[0] = buffer.lastTrailingHighSurrogate;
- buffer.lastTrailingHighSurrogate = 0;
- offset = 1;
- } else {
- offset = 0;
- }
-
- final int read = readFully(reader, charBuffer, offset, numChars - offset);
-
- buffer.length = offset + read;
- final boolean result = buffer.length == numChars;
- if (buffer.length < numChars) {
- // We failed to fill the buffer. Even if the last char is a high
- // surrogate, there is nothing we can do
- return result;
- }
-
- if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
- buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
- }
- return result;
- }
-
- /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
- public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
- return fill(buffer, reader, buffer.buffer.length);
- }
-
- static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
- int read = 0;
- while (read < len) {
- final int r = reader.read(dest, offset + read, len - read);
- if (r == -1) {
- break;
- }
- read += r;
- }
- return read;
- }
-
- /**
- * A simple IO buffer to use with
- * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
- */
- public static final class CharacterBuffer {
-
- private final char[] buffer;
- private int offset;
- private int length;
- // NOTE: not private so outer class can access without
- // $access methods:
- char lastTrailingHighSurrogate;
-
- CharacterBuffer(char[] buffer, int offset, int length) {
- this.buffer = buffer;
- this.offset = offset;
- this.length = length;
- }
-
- /**
- * Returns the internal buffer
- *
- * @return the buffer
- */
- public char[] getBuffer() {
- return buffer;
- }
-
- /**
- * Returns the data offset in the internal buffer.
- *
- * @return the offset
- */
- public int getOffset() {
- return offset;
- }
-
- /**
- * Return the length of the data in the internal buffer starting at
- * {@link #getOffset()}
- *
- * @return the length
- */
- public int getLength() {
- return length;
- }
-
- /**
- * Resets the CharacterBuffer. All internals are reset to its default
- * values.
- */
- public void reset() {
- offset = 0;
- length = 0;
- lastTrailingHighSurrogate = 0;
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java
index d7689f9..be5f04c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java
@@ -19,10 +19,10 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java
index 31c3027..fff3edc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.util.Map;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
deleted file mode 100644
index 97d35e2..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-/**
- * Abstract base class for TokenFilters that may remove tokens.
- * You have to implement {@link #accept} and return a boolean if the current
- * token should be preserved. {@link #incrementToken} uses this method
- * to decide if a token should be passed to the caller.
- */
-public abstract class FilteringTokenFilter extends TokenFilter {
-
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private int skippedPositions;
-
- /**
- * Create a new {@link FilteringTokenFilter}.
- * @param in the {@link TokenStream} to consume
- */
- public FilteringTokenFilter(TokenStream in) {
- super(in);
- }
-
- /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
- protected abstract boolean accept() throws IOException;
-
- @Override
- public final boolean incrementToken() throws IOException {
- skippedPositions = 0;
- while (input.incrementToken()) {
- if (accept()) {
- if (skippedPositions != 0) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
- }
- skippedPositions += posIncrAtt.getPositionIncrement();
- }
-
- // reached EOS -- return false
- return false;
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- skippedPositions = 0;
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
deleted file mode 100644
index fc6c798..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.util.IOUtils;
-
-/**
- * Base class for Analyzers that need to make use of stopword sets.
- *
- */
-public abstract class StopwordAnalyzerBase extends Analyzer {
-
- /**
- * An immutable stopword set
- */
- protected final CharArraySet stopwords;
-
- /**
- * Returns the analyzer's stopword set or an empty set if the analyzer has no
- * stopwords
- *
- * @return the analyzer's stopword set or an empty set if the analyzer has no
- * stopwords
- */
- public CharArraySet getStopwordSet() {
- return stopwords;
- }
-
- /**
- * Creates a new instance initialized with the given stopword set
- *
- * @param stopwords
- * the analyzer's stopword set
- */
- protected StopwordAnalyzerBase(final CharArraySet stopwords) {
- // analyzers should use char array set for stopwords!
- this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
- .unmodifiableSet(CharArraySet.copy(stopwords));
- }
-
- /**
- * Creates a new Analyzer with an empty stopword set
- */
- protected StopwordAnalyzerBase() {
- this(null);
- }
-
- /**
- * Creates a CharArraySet from a file resource associated with a class. (See
- * {@link Class#getResourceAsStream(String)}).
- *
- * @param ignoreCase
- * <code>true</code> if the set should ignore the case of the
- * stopwords, otherwise <code>false</code>
- * @param aClass
- * a class that is associated with the given stopwordResource
- * @param resource
- * name of the resource file associated with the given class
- * @param comment
- * comment string to ignore in the stopword file
- * @return a CharArraySet containing the distinct stopwords from the given
- * file
- * @throws IOException
- * if loading the stopwords throws an {@link IOException}
- */
- protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
- final Class<? extends Analyzer> aClass, final String resource,
- final String comment) throws IOException {
- Reader reader = null;
- try {
- reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
- return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase));
- } finally {
- IOUtils.close(reader);
- }
-
- }
-
- /**
- * Creates a CharArraySet from a path.
- *
- * @param stopwords
- * the stopwords file to load
- * @return a CharArraySet containing the distinct stopwords from the given
- * file
- * @throws IOException
- * if loading the stopwords throws an {@link IOException}
- */
- protected static CharArraySet loadStopwordSet(Path stopwords) throws IOException {
- Reader reader = null;
- try {
- reader = Files.newBufferedReader(stopwords, StandardCharsets.UTF_8);
- return WordlistLoader.getWordSet(reader);
- } finally {
- IOUtils.close(reader);
- }
- }
-
- /**
- * Creates a CharArraySet from a file.
- *
- * @param stopwords
- * the stopwords reader to load
- *
- * @return a CharArraySet containing the distinct stopwords from the given
- * reader
- * @throws IOException
- * if loading the stopwords throws an {@link IOException}
- */
- protected static CharArraySet loadStopwordSet(Reader stopwords) throws IOException {
- try {
- return WordlistLoader.getWordSet(stopwords);
- } finally {
- IOUtils.close(stopwords);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
deleted file mode 100644
index 4d99965..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.util.IOUtils;
-
-/**
- * Loader for text files that represent a list of stopwords.
- *
- * @see IOUtils to obtain {@link Reader} instances
- * @lucene.internal
- */
-public class WordlistLoader {
-
- private static final int INITIAL_CAPACITY = 16;
-
- /** no instance */
- private WordlistLoader() {}
-
- /**
- * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
- * leading and trailing whitespace). Every line of the Reader should contain only
- * one word. The words need to be in lowercase if you make use of an
- * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- *
- * @param reader Reader containing the wordlist
- * @param result the {@link CharArraySet} to fill with the readers words
- * @return the given {@link CharArraySet} with the reader's words
- */
- public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
- String word = null;
- while ((word = br.readLine()) != null) {
- result.add(word.trim());
- }
- }
- finally {
- IOUtils.close(br);
- }
- return result;
- }
-
- /**
- * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
- * leading and trailing whitespace). Every line of the Reader should contain only
- * one word. The words need to be in lowercase if you make use of an
- * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- *
- * @param reader Reader containing the wordlist
- * @return A {@link CharArraySet} with the reader's words
- */
- public static CharArraySet getWordSet(Reader reader) throws IOException {
- return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
- }
-
- /**
- * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
- * leading and trailing whitespace). Every line of the Reader should contain only
- * one word. The words need to be in lowercase if you make use of an
- * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- *
- * @param reader Reader containing the wordlist
- * @param comment The string representing a comment.
- * @return A CharArraySet with the reader's words
- */
- public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
- return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
- }
-
- /**
- * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
- * leading and trailing whitespace). Every line of the Reader should contain only
- * one word. The words need to be in lowercase if you make use of an
- * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- *
- * @param reader Reader containing the wordlist
- * @param comment The string representing a comment.
- * @param result the {@link CharArraySet} to fill with the readers words
- * @return the given {@link CharArraySet} with the reader's words
- */
- public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
- String word = null;
- while ((word = br.readLine()) != null) {
- if (word.startsWith(comment) == false){
- result.add(word.trim());
- }
- }
- }
- finally {
- IOUtils.close(br);
- }
- return result;
- }
-
-
- /**
- * Reads stopwords from a stopword list in Snowball format.
- * <p>
- * The snowball format is the following:
- * <ul>
- * <li>Lines may contain multiple words separated by whitespace.
- * <li>The comment character is the vertical line (|).
- * <li>Lines may contain trailing comments.
- * </ul>
- *
- * @param reader Reader containing a Snowball stopword list
- * @param result the {@link CharArraySet} to fill with the readers words
- * @return the given {@link CharArraySet} with the reader's words
- */
- public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
- throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
- String line = null;
- while ((line = br.readLine()) != null) {
- int comment = line.indexOf('|');
- if (comment >= 0) line = line.substring(0, comment);
- String words[] = line.split("\\s+");
- for (int i = 0; i < words.length; i++)
- if (words[i].length() > 0) result.add(words[i]);
- }
- } finally {
- IOUtils.close(br);
- }
- return result;
- }
-
- /**
- * Reads stopwords from a stopword list in Snowball format.
- * <p>
- * The snowball format is the following:
- * <ul>
- * <li>Lines may contain multiple words separated by whitespace.
- * <li>The comment character is the vertical line (|).
- * <li>Lines may contain trailing comments.
- * </ul>
- *
- * @param reader Reader containing a Snowball stopword list
- * @return A {@link CharArraySet} with the reader's words
- */
- public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
- return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
- }
-
-
- /**
- * Reads a stem dictionary. Each line contains:
- * <pre>word<b>\t</b>stem</pre>
- * (i.e. two tab separated words)
- *
- * @return stem dictionary that overrules the stemming algorithm
- * @throws IOException If there is a low-level I/O error.
- */
- public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
- BufferedReader br = null;
- try {
- br = getBufferedReader(reader);
- String line;
- while ((line = br.readLine()) != null) {
- String[] wordstem = line.split("\t", 2);
- result.put(wordstem[0], wordstem[1]);
- }
- } finally {
- IOUtils.close(br);
- }
- return result;
- }
-
- /**
- * Accesses a resource by name and returns the (non comment) lines containing
- * data using the given character encoding.
- *
- * <p>
- * A comment line is any line that starts with the character "#"
- * </p>
- *
- * @return a list of non-blank non-comment lines with whitespace trimmed
- * @throws IOException If there is a low-level I/O error.
- */
- public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
- BufferedReader input = null;
- ArrayList<String> lines;
- boolean success = false;
- try {
- input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
-
- lines = new ArrayList<>();
- for (String word=null; (word=input.readLine())!=null;) {
- // skip initial bom marker
- if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
- word = word.substring(1);
- // skip comments
- if (word.startsWith("#")) continue;
- word=word.trim();
- // skip blank lines
- if (word.length()==0) continue;
- lines.add(word);
- }
- success = true;
- return lines;
- } finally {
- if (success) {
- IOUtils.close(input);
- } else {
- IOUtils.closeWhileHandlingException(input);
- }
- }
- }
-
- private static BufferedReader getBufferedReader(Reader reader) {
- return (reader instanceof BufferedReader) ? (BufferedReader) reader
- : new BufferedReader(reader);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
index 19a1c7e..e56071a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
@@ -36,7 +36,7 @@
* </li>
* <li>
* Effective Locale-specific normalization (case differences, diacritics, etc.).
- * ({@link org.apache.lucene.analysis.core.LowerCaseFilter} and
+ * ({@link org.apache.lucene.analysis.LowerCaseFilter} and
* {@link org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter} provide these services
* in a generic way that doesn't take into account locale-specific needs.)
* </li>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
index 9842687..2a6a1c7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.ar;
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
index ca15485..872e7f5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
@@ -21,11 +21,11 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Test the Arabic Normalization Filter
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
index 7f890b9..582d8e4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Test the Bulgarian analyzer
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
index daad7bb..2538717 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
@@ -22,11 +22,11 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Test the Bulgarian Stemmer
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
index a05dd0b..550a62a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
@@ -22,11 +22,11 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Test the Brazilian Stem Filter, which only modifies the term text.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
index fd65332..289f22b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
index 72c510c..1a47e42 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Most tests adopted from TestCJKTokenizer
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
index d08817c..dcb083d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
index 1171574..e940489 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
@@ -20,9 +20,9 @@ import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* Tests CommonGrams(Query)Filter
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index 98c351e..5bcfb3d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -17,18 +17,18 @@
package org.apache.lucene.analysis.commongrams;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.TestStopFilter;
+import org.apache.lucene.analysis.core.TestStopFilterFactory;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
-import java.io.StringReader;
-
/**
* Tests pretty much copied from StopFilterFactoryTest We use the test files
* used by the StopFilterFactoryTest TODO: consider creating separate test files
@@ -37,7 +37,7 @@ import java.io.StringReader;
public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testInform() throws Exception {
- ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
+ ResourceLoader loader = new ClasspathResourceLoader(TestStopFilterFactory.class);
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams", Version.LATEST, loader,
"words", "stop-1.txt",
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java
index 776365e..23d1bd4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java
@@ -16,12 +16,11 @@
*/
package org.apache.lucene.analysis.commongrams;
-
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.TestStopFilter;
+import org.apache.lucene.analysis.core.TestStopFilterFactory;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
@@ -34,7 +33,7 @@ import org.apache.lucene.util.Version;
public class TestCommonGramsQueryFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testInform() throws Exception {
- ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
+ ResourceLoader loader = new ClasspathResourceLoader(TestStopFilterFactory.class);
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsQueryFilterFactory factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery", Version.LATEST, loader,
"words", "stop-1.txt",
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index 636d9ba..ed3abe4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -32,7 +33,6 @@ import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
[05/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
new file mode 100644
index 0000000..e7e610a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
@@ -0,0 +1,669 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.Arrays;
+import java.util.AbstractMap;
+import java.util.AbstractSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharacterUtils;
+
+/**
+ * A simple class that stores key Strings as char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class. For example, it cannot remove items from the
+ * map, nor does it resize its hash table to be smaller,
+ * etc. It is designed to be quick to retrieve items
+ * by char[] keys without the necessity of converting
+ * to a String first.
+ */
+public class CharArrayMap<V> extends AbstractMap<Object,V> {
+ // private only because missing generics
+ private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
+
+ private final static int INIT_SIZE = 8;
+ private boolean ignoreCase;
+ private int count;
+ char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
+ V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
+
+ /**
+ * Create map with enough capacity to hold startSize terms
+ *
+ * @param startSize
+ * the initial capacity
+ * @param ignoreCase
+ * <code>false</code> if and only if the set should be case sensitive
+ * otherwise <code>true</code>.
+ */
+ @SuppressWarnings("unchecked")
+ public CharArrayMap(int startSize, boolean ignoreCase) {
+ this.ignoreCase = ignoreCase;
+ int size = INIT_SIZE;
+ while(startSize + (startSize>>2) > size)
+ size <<= 1;
+ keys = new char[size][];
+ values = (V[]) new Object[size];
+ }
+
+ /**
+ * Creates a map from the mappings in another map.
+ *
+ * @param c
+ * a map whose mappings to be copied
+ * @param ignoreCase
+ * <code>false</code> if and only if the set should be case sensitive
+ * otherwise <code>true</code>.
+ */
+ public CharArrayMap(Map<?,? extends V> c, boolean ignoreCase) {
+ this(c.size(), ignoreCase);
+ putAll(c);
+ }
+
+ /** Create set from the supplied map (used internally for readonly maps...) */
+ private CharArrayMap(CharArrayMap<V> toCopy){
+ this.keys = toCopy.keys;
+ this.values = toCopy.values;
+ this.ignoreCase = toCopy.ignoreCase;
+ this.count = toCopy.count;
+ }
+
+ /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
+ @Override
+ public void clear() {
+ count = 0;
+ Arrays.fill(keys, null);
+ Arrays.fill(values, null);
+ }
+
+ /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+ * are in the {@link #keySet()} */
+ public boolean containsKey(char[] text, int off, int len) {
+ return keys[getSlot(text, off, len)] != null;
+ }
+
+ /** true if the <code>CharSequence</code> is in the {@link #keySet()} */
+ public boolean containsKey(CharSequence cs) {
+ return keys[getSlot(cs)] != null;
+ }
+
+ @Override
+ public boolean containsKey(Object o) {
+ if (o instanceof char[]) {
+ final char[] text = (char[])o;
+ return containsKey(text, 0, text.length);
+ }
+ return containsKey(o.toString());
+ }
+
+ /** returns the value of the mapping of <code>len</code> chars of <code>text</code>
+ * starting at <code>off</code> */
+ public V get(char[] text, int off, int len) {
+ return values[getSlot(text, off, len)];
+ }
+
+ /** returns the value of the mapping of the chars inside this {@code CharSequence} */
+ public V get(CharSequence cs) {
+ return values[getSlot(cs)];
+ }
+
+ @Override
+ public V get(Object o) {
+ if (o instanceof char[]) {
+ final char[] text = (char[])o;
+ return get(text, 0, text.length);
+ }
+ return get(o.toString());
+ }
+
+ private int getSlot(char[] text, int off, int len) {
+ int code = getHashCode(text, off, len);
+ int pos = code & (keys.length-1);
+ char[] text2 = keys[pos];
+ if (text2 != null && !equals(text, off, len, text2)) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ pos = code & (keys.length-1);
+ text2 = keys[pos];
+ } while (text2 != null && !equals(text, off, len, text2));
+ }
+ return pos;
+ }
+
+ /** Returns true if the String is in the set */
+ private int getSlot(CharSequence text) {
+ int code = getHashCode(text);
+ int pos = code & (keys.length-1);
+ char[] text2 = keys[pos];
+ if (text2 != null && !equals(text, text2)) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ pos = code & (keys.length-1);
+ text2 = keys[pos];
+ } while (text2 != null && !equals(text, text2));
+ }
+ return pos;
+ }
+
+ /** Add the given mapping. */
+ public V put(CharSequence text, V value) {
+ return put(text.toString(), value); // could be more efficient
+ }
+
+ @Override
+ public V put(Object o, V value) {
+ if (o instanceof char[]) {
+ return put((char[])o, value);
+ }
+ return put(o.toString(), value);
+ }
+
+ /** Add the given mapping. */
+ public V put(String text, V value) {
+ return put(text.toCharArray(), value);
+ }
+
+ /** Add the given mapping.
+ * If ignoreCase is true for this Set, the text array will be directly modified.
+ * The user should never modify this text array after calling this method.
+ */
+ public V put(char[] text, V value) {
+ if (ignoreCase) {
+ CharacterUtils.toLowerCase(text, 0, text.length);
+ }
+ int slot = getSlot(text, 0, text.length);
+ if (keys[slot] != null) {
+ final V oldValue = values[slot];
+ values[slot] = value;
+ return oldValue;
+ }
+ keys[slot] = text;
+ values[slot] = value;
+ count++;
+
+ if (count + (count>>2) > keys.length) {
+ rehash();
+ }
+
+ return null;
+ }
+
+ @SuppressWarnings("unchecked")
+ private void rehash() {
+ assert keys.length == values.length;
+ final int newSize = 2*keys.length;
+ final char[][] oldkeys = keys;
+ final V[] oldvalues = values;
+ keys = new char[newSize][];
+ values = (V[]) new Object[newSize];
+
+ for(int i=0; i<oldkeys.length; i++) {
+ char[] text = oldkeys[i];
+ if (text != null) {
+ // todo: could be faster... no need to compare strings on collision
+ final int slot = getSlot(text,0,text.length);
+ keys[slot] = text;
+ values[slot] = oldvalues[i];
+ }
+ }
+ }
+
+ private boolean equals(char[] text1, int off, int len, char[] text2) {
+ if (len != text2.length)
+ return false;
+ final int limit = off+len;
+ if (ignoreCase) {
+ for(int i=0;i<len;) {
+ final int codePointAt = Character.codePointAt(text1, off+i, limit);
+ if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
+ return false;
+ i += Character.charCount(codePointAt);
+ }
+ } else {
+ for(int i=0;i<len;i++) {
+ if (text1[off+i] != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private boolean equals(CharSequence text1, char[] text2) {
+ int len = text1.length();
+ if (len != text2.length)
+ return false;
+ if (ignoreCase) {
+ for(int i=0;i<len;) {
+ final int codePointAt = Character.codePointAt(text1, i);
+ if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
+ return false;
+ i += Character.charCount(codePointAt);
+ }
+ } else {
+ for(int i=0;i<len;i++) {
+ if (text1.charAt(i) != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private int getHashCode(char[] text, int offset, int len) {
+ if (text == null)
+ throw new NullPointerException();
+ int code = 0;
+ final int stop = offset + len;
+ if (ignoreCase) {
+ for (int i=offset; i<stop;) {
+ final int codePointAt = Character.codePointAt(text, i, stop);
+ code = code*31 + Character.toLowerCase(codePointAt);
+ i += Character.charCount(codePointAt);
+ }
+ } else {
+ for (int i=offset; i<stop; i++) {
+ code = code*31 + text[i];
+ }
+ }
+ return code;
+ }
+
+ private int getHashCode(CharSequence text) {
+ if (text == null)
+ throw new NullPointerException();
+ int code = 0;
+ int len = text.length();
+ if (ignoreCase) {
+ for (int i=0; i<len;) {
+ int codePointAt = Character.codePointAt(text, i);
+ code = code*31 + Character.toLowerCase(codePointAt);
+ i += Character.charCount(codePointAt);
+ }
+ } else {
+ for (int i=0; i<len; i++) {
+ code = code*31 + text.charAt(i);
+ }
+ }
+ return code;
+ }
+
/** Not supported: this map can never remove individual entries (only {@code clear()}).
 * @throws UnsupportedOperationException always */
@Override
public V remove(Object key) {
  throw new UnsupportedOperationException();
}
+
/** Returns the number of mappings currently in this map. */
@Override
public int size() {
  return count;
}
+
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder("{");
+ for (Map.Entry<Object,V> entry : entrySet()) {
+ if (sb.length()>1) sb.append(", ");
+ sb.append(entry);
+ }
+ return sb.append('}').toString();
+ }
+
+ private EntrySet entrySet = null;
+ private CharArraySet keySet = null;
+
/** Factory for the entry-set view; returns a modifiable view here and is
 * overridden by {@code UnmodifiableCharArrayMap} to return a read-only one. */
EntrySet createEntrySet() {
  return new EntrySet(true);
}
+
+ @Override
+ public final EntrySet entrySet() {
+ if (entrySet == null) {
+ entrySet = createEntrySet();
+ }
+ return entrySet;
+ }
+
// helper for CharArraySet to not produce endless recursion
/** Returns the plain superclass key-set view; CharArraySet iterates this one
 * instead of {@link #keySet()}, which would recurse back into CharArraySet. */
final Set<Object> originalKeySet() {
  return super.keySet();
}
+
/** Returns a {@link CharArraySet} view on the map's keys.
 * The set will use the same {@code matchVersion} as this map.
 * All {@code add} overloads of the returned view throw, because adding a key
 * without a value would corrupt the backing map. The view is created lazily
 * and cached. */
@Override @SuppressWarnings({"unchecked","rawtypes"})
public final CharArraySet keySet() {
  if (keySet == null) {
    // prevent adding of entries
    keySet = new CharArraySet((CharArrayMap) this) {
      @Override
      public boolean add(Object o) {
        throw new UnsupportedOperationException();
      }
      @Override
      public boolean add(CharSequence text) {
        throw new UnsupportedOperationException();
      }
      @Override
      public boolean add(String text) {
        throw new UnsupportedOperationException();
      }
      @Override
      public boolean add(char[] text) {
        throw new UnsupportedOperationException();
      }
    };
  }
  return keySet;
}
+
/** Public iterator class so efficient methods are exposed to users.
 * Iterates occupied slots of the open-addressed table.
 * {@code pos} is the slot that will be returned next; {@code lastPos} is the
 * slot most recently returned (what currentValue()/setValue() operate on). */
public class EntryIterator implements Iterator<Map.Entry<Object,V>> {
  private int pos=-1;      // slot to be returned by the next call to next()/nextKey()
  private int lastPos;     // slot of the most recently returned entry
  private final boolean allowModify;

  private EntryIterator(boolean allowModify) {
    this.allowModify = allowModify;
    goNext(); // position on the first occupied slot
  }

  /** Advances pos to the next occupied slot, saving the previous pos in lastPos. */
  private void goNext() {
    lastPos = pos;
    pos++;
    while (pos < keys.length && keys[pos] == null) pos++;
  }

  @Override
  public boolean hasNext() {
    return pos < keys.length;
  }

  /** gets the next key... do not modify the returned char[] */
  public char[] nextKey() {
    goNext();
    return keys[lastPos];
  }

  /** gets the next key as a newly created String object */
  public String nextKeyString() {
    return new String(nextKey());
  }

  /** returns the value associated with the last key returned */
  public V currentValue() {
    return values[lastPos];
  }

  /** sets the value associated with the last key returned
   * @throws UnsupportedOperationException if this iterator was created over a read-only view */
  public V setValue(V value) {
    if (!allowModify)
      throw new UnsupportedOperationException();
    V old = values[lastPos];
    values[lastPos] = value;
    return old;
  }

  /** use nextCharArray() + currentValue() for better efficiency. */
  @Override
  public Map.Entry<Object,V> next() {
    goNext();
    return new MapEntry(lastPos, allowModify);
  }

  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }
}
+
+ private final class MapEntry implements Map.Entry<Object,V> {
+ private final int pos;
+ private final boolean allowModify;
+
+ private MapEntry(int pos, boolean allowModify) {
+ this.pos = pos;
+ this.allowModify = allowModify;
+ }
+
+ @Override
+ public Object getKey() {
+ // we must clone here, as putAll to another CharArrayMap
+ // with other case sensitivity flag would corrupt the keys
+ return keys[pos].clone();
+ }
+
+ @Override
+ public V getValue() {
+ return values[pos];
+ }
+
+ @Override
+ public V setValue(V value) {
+ if (!allowModify)
+ throw new UnsupportedOperationException();
+ final V old = values[pos];
+ values[pos] = value;
+ return old;
+ }
+
+ @Override
+ public String toString() {
+ return new StringBuilder().append(keys[pos]).append('=')
+ .append((values[pos] == CharArrayMap.this) ? "(this Map)" : values[pos])
+ .toString();
+ }
+ }
+
/** public EntrySet class so efficient methods are exposed to users.
 * Entry removal is unsupported; clear() is forwarded to the outer map
 * unless this is a read-only view. */
public final class EntrySet extends AbstractSet<Map.Entry<Object,V>> {
  private final boolean allowModify;

  private EntrySet(boolean allowModify) {
    this.allowModify = allowModify;
  }

  @Override
  public EntryIterator iterator() {
    return new EntryIterator(allowModify);
  }

  @Override
  @SuppressWarnings("unchecked")
  public boolean contains(Object o) {
    if (!(o instanceof Map.Entry))
      return false;
    final Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
    final Object key = e.getKey();
    final Object val = e.getValue();
    final Object v = get(key);
    // NOTE(review): when the key is absent, get() returns null, so an entry
    // carrying a null value is reported as contained even though the key is
    // not in the map — confirm this deviation from the Map contract is intended.
    return v == null ? val == null : v.equals(val);
  }

  @Override
  public boolean remove(Object o) {
    throw new UnsupportedOperationException();
  }

  @Override
  public int size() {
    return count;
  }

  @Override
  public void clear() {
    if (!allowModify)
      throw new UnsupportedOperationException();
    CharArrayMap.this.clear();
  }
}
+
+ /**
+ * Returns an unmodifiable {@link CharArrayMap}. This allows to provide
+ * unmodifiable views of internal map for "read-only" use.
+ *
+ * @param map
+ * a map for which the unmodifiable map is returned.
+ * @return an new unmodifiable {@link CharArrayMap}.
+ * @throws NullPointerException
+ * if the given map is <code>null</code>.
+ */
+ public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
+ if (map == null)
+ throw new NullPointerException("Given map is null");
+ if (map == emptyMap() || map.isEmpty())
+ return emptyMap();
+ if (map instanceof UnmodifiableCharArrayMap)
+ return map;
+ return new UnmodifiableCharArrayMap<>(map);
+ }
+
/**
 * Returns a copy of the given map as a {@link CharArrayMap}. If the given map
 * is a {@link CharArrayMap} the ignoreCase property will be preserved.
 *
 * @param map
 *          a map to copy
 * @return a copy of the given map as a {@link CharArrayMap}. If the given map
 *         is a {@link CharArrayMap}, its ignoreCase property (and
 *         matchVersion) will be preserved.
 */
@SuppressWarnings("unchecked")
public static <V> CharArrayMap<V> copy(final Map<?,? extends V> map) {
  if(map == EMPTY_MAP)
    return emptyMap();
  if(map instanceof CharArrayMap) {
    CharArrayMap<V> m = (CharArrayMap<V>) map;
    // use fast path instead of iterating all values
    // this is even on very small sets ~10 times faster than iterating
    // (clone the slot arrays wholesale instead of re-hashing every key)
    final char[][] keys = new char[m.keys.length][];
    System.arraycopy(m.keys, 0, keys, 0, keys.length);
    final V[] values = (V[]) new Object[m.values.length];
    System.arraycopy(m.values, 0, values, 0, values.length);
    m = new CharArrayMap<>(m);
    m.keys = keys;
    m.values = values;
    return m;
  }
  // arbitrary Map: rebuild case-sensitively via the copy constructor
  return new CharArrayMap<>(map, false);
}
+
/** Returns an empty, unmodifiable map (a shared singleton). */
@SuppressWarnings("unchecked")
public static <V> CharArrayMap<V> emptyMap() {
  return (CharArrayMap<V>) EMPTY_MAP;
}
+
// package private CharArraySet instanceof check in CharArraySet
/** Read-only wrapper: shares the wrapped map's storage (via the copy
 * constructor) but rejects every mutating operation. */
static class UnmodifiableCharArrayMap<V> extends CharArrayMap<V> {

  UnmodifiableCharArrayMap(CharArrayMap<V> map) {
    super(map);
  }

  /** @throws UnsupportedOperationException always */
  @Override
  public void clear() {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always */
  @Override
  public V put(Object o, V val){
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always */
  @Override
  public V put(char[] text, V val) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always */
  @Override
  public V put(CharSequence text, V val) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always */
  @Override
  public V put(String text, V val) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always */
  @Override
  public V remove(Object key) {
    throw new UnsupportedOperationException();
  }

  /** Entry-set view with modification disabled. */
  @Override
  EntrySet createEntrySet() {
    return new EntrySet(false);
  }
}
+
/**
 * Empty {@link org.apache.lucene.analysis.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
 * Contains checks will always return <code>false</code> or throw
 * NPE if necessary; lookups return <code>null</code> without touching the table.
 */
private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> {
  EmptyCharArrayMap() {
    super(new CharArrayMap<V>(0, false));
  }

  /** Always false; preserves the NPE-on-null contract of the real lookup. */
  @Override
  public boolean containsKey(char[] text, int off, int len) {
    if(text == null)
      throw new NullPointerException();
    return false;
  }

  /** Always false; preserves the NPE-on-null contract of the real lookup. */
  @Override
  public boolean containsKey(CharSequence cs) {
    if(cs == null)
      throw new NullPointerException();
    return false;
  }

  /** Always false; preserves the NPE-on-null contract of the real lookup. */
  @Override
  public boolean containsKey(Object o) {
    if(o == null)
      throw new NullPointerException();
    return false;
  }

  /** Always null; preserves the NPE-on-null contract of the real lookup. */
  @Override
  public V get(char[] text, int off, int len) {
    if(text == null)
      throw new NullPointerException();
    return null;
  }

  /** Always null; preserves the NPE-on-null contract of the real lookup. */
  @Override
  public V get(CharSequence cs) {
    if(cs == null)
      throw new NullPointerException();
    return null;
  }

  /** Always null; preserves the NPE-on-null contract of the real lookup. */
  @Override
  public V get(Object o) {
    if(o == null)
      throw new NullPointerException();
    return null;
  }
}
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
new file mode 100644
index 0000000..4c8066a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * A simple class that stores Strings as char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class. For example, it cannot remove items from the
+ * set, nor does it resize its hash table to be smaller,
+ * etc. It is designed to be quick to test if a char[]
+ * is in the set without the necessity of converting it
+ * to a String first.
+ *
+ * <P>
+ * <em>Please note:</em> This class implements {@link java.util.Set Set} but
+ * does not behave like it should in all cases. The generic type is
+ * {@code Set<Object>}, because you can add any object to it,
+ * that has a string representation. The add methods will use
+ * {@link Object#toString} and store the result using a {@code char[]}
+ * buffer. The {@code contains()} methods have the same behavior.
+ * The {@link #iterator()} returns an {@code Iterator<char[]>}.
+ */
+public class CharArraySet extends AbstractSet<Object> {
+
+ /** An empty {@code CharArraySet}. */
+ public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
+
+ private static final Object PLACEHOLDER = new Object();
+
+ private final CharArrayMap<Object> map;
+
+ /**
+ * Create set with enough capacity to hold startSize terms
+ *
+ * @param startSize
+ * the initial capacity
+ * @param ignoreCase
+ * <code>false</code> if and only if the set should be case sensitive
+ * otherwise <code>true</code>.
+ */
+ public CharArraySet(int startSize, boolean ignoreCase) {
+ this(new CharArrayMap<>(startSize, ignoreCase));
+ }
+
+ /**
+ * Creates a set from a Collection of objects.
+ *
+ * @param c
+ * a collection whose elements to be placed into the set
+ * @param ignoreCase
+ * <code>false</code> if and only if the set should be case sensitive
+ * otherwise <code>true</code>.
+ */
+ public CharArraySet(Collection<?> c, boolean ignoreCase) {
+ this(c.size(), ignoreCase);
+ addAll(c);
+ }
+
+ /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
+ CharArraySet(final CharArrayMap<Object> map){
+ this.map = map;
+ }
+
+ /** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
+ @Override
+ public void clear() {
+ map.clear();
+ }
+
+ /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+ * are in the set */
+ public boolean contains(char[] text, int off, int len) {
+ return map.containsKey(text, off, len);
+ }
+
+ /** true if the <code>CharSequence</code> is in the set */
+ public boolean contains(CharSequence cs) {
+ return map.containsKey(cs);
+ }
+
+ @Override
+ public boolean contains(Object o) {
+ return map.containsKey(o);
+ }
+
+ @Override
+ public boolean add(Object o) {
+ return map.put(o, PLACEHOLDER) == null;
+ }
+
+ /** Add this CharSequence into the set */
+ public boolean add(CharSequence text) {
+ return map.put(text, PLACEHOLDER) == null;
+ }
+
+ /** Add this String into the set */
+ public boolean add(String text) {
+ return map.put(text, PLACEHOLDER) == null;
+ }
+
+ /** Add this char[] directly to the set.
+ * If ignoreCase is true for this Set, the text array will be directly modified.
+ * The user should never modify this text array after calling this method.
+ */
+ public boolean add(char[] text) {
+ return map.put(text, PLACEHOLDER) == null;
+ }
+
+ @Override
+ public int size() {
+ return map.size();
+ }
+
+ /**
+ * Returns an unmodifiable {@link CharArraySet}. This allows to provide
+ * unmodifiable views of internal sets for "read-only" use.
+ *
+ * @param set
+ * a set for which the unmodifiable set is returned.
+ * @return an new unmodifiable {@link CharArraySet}.
+ * @throws NullPointerException
+ * if the given set is <code>null</code>.
+ */
+ public static CharArraySet unmodifiableSet(CharArraySet set) {
+ if (set == null)
+ throw new NullPointerException("Given set is null");
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
+ return set;
+ return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
+ }
+
+ /**
+ * Returns a copy of the given set as a {@link CharArraySet}. If the given set
+ * is a {@link CharArraySet} the ignoreCase property will be preserved.
+ *
+ * @param set
+ * a set to copy
+ * @return a copy of the given set as a {@link CharArraySet}. If the given set
+ * is a {@link CharArraySet} the ignoreCase property as well as the
+ * matchVersion will be of the given set will be preserved.
+ */
+ public static CharArraySet copy(final Set<?> set) {
+ if(set == EMPTY_SET)
+ return EMPTY_SET;
+ if(set instanceof CharArraySet) {
+ final CharArraySet source = (CharArraySet) set;
+ return new CharArraySet(CharArrayMap.copy(source.map));
+ }
+ return new CharArraySet(set, false);
+ }
+
+ /**
+ * Returns an {@link Iterator} for {@code char[]} instances in this set.
+ */
+ @Override @SuppressWarnings("unchecked")
+ public Iterator<Object> iterator() {
+ // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
+ return map.originalKeySet().iterator();
+ }
+
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder("[");
+ for (Object item : this) {
+ if (sb.length()>1) sb.append(", ");
+ if (item instanceof char[]) {
+ sb.append((char[]) item);
+ } else {
+ sb.append(item);
+ }
+ }
+ return sb.append(']').toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
new file mode 100644
index 0000000..e2cc47f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Utility class to write tokenizers or token filters.
+ * @lucene.internal
+ */
+public final class CharacterUtils {
+
+ private CharacterUtils() {} // no instantiation
+
+ /**
+ * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
+ * of the given bufferSize.
+ *
+ * @param bufferSize
+ * the internal char buffer size, must be <code>>= 2</code>
+ * @return a new {@link CharacterBuffer} instance.
+ */
+ public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
+ if (bufferSize < 2) {
+ throw new IllegalArgumentException("buffersize must be >= 2");
+ }
+ return new CharacterBuffer(new char[bufferSize], 0, 0);
+ }
+
+
+ /**
+ * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
+ * at the given offset.
+ * @param buffer the char buffer to lowercase
+ * @param offset the offset to start at
+ * @param limit the max char in the buffer to lower case
+ */
+ public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
+ assert buffer.length >= limit;
+ assert offset <=0 && offset <= buffer.length;
+ for (int i = offset; i < limit;) {
+ i += Character.toChars(
+ Character.toLowerCase(
+ Character.codePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
+ /**
+ * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting
+ * at the given offset.
+ * @param buffer the char buffer to UPPERCASE
+ * @param offset the offset to start at
+ * @param limit the max char in the buffer to lower case
+ */
+ public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
+ assert buffer.length >= limit;
+ assert offset <=0 && offset <= buffer.length;
+ for (int i = offset; i < limit;) {
+ i += Character.toChars(
+ Character.toUpperCase(
+ Character.codePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
+ /** Converts a sequence of Java characters to a sequence of unicode code points.
+ * @return the number of code points written to the destination buffer */
+ public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+ if (srcLen < 0) {
+ throw new IllegalArgumentException("srcLen must be >= 0");
+ }
+ int codePointCount = 0;
+ for (int i = 0; i < srcLen; ) {
+ final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
+ final int charCount = Character.charCount(cp);
+ dest[destOff + codePointCount++] = cp;
+ i += charCount;
+ }
+ return codePointCount;
+ }
+
+ /** Converts a sequence of unicode code points to a sequence of Java characters.
+ * @return the number of chars written to the destination buffer */
+ public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+ if (srcLen < 0) {
+ throw new IllegalArgumentException("srcLen must be >= 0");
+ }
+ int written = 0;
+ for (int i = 0; i < srcLen; ++i) {
+ written += Character.toChars(src[srcOff + i], dest, destOff + written);
+ }
+ return written;
+ }
+
+ /**
+ * Fills the {@link CharacterBuffer} with characters read from the given
+ * reader {@link Reader}. This method tries to read <code>numChars</code>
+ * characters into the {@link CharacterBuffer}, each call to fill will start
+ * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
+ * In case code points can span across 2 java characters, this method may
+ * only fill <code>numChars - 1</code> characters in order not to split in
+ * the middle of a surrogate pair, even if there are remaining characters in
+ * the {@link Reader}.
+ * <p>
+ * This method guarantees
+ * that the given {@link CharacterBuffer} will never contain a high surrogate
+ * character as the last element in the buffer unless it is the last available
+ * character in the reader. In other words, high and low surrogate pairs will
+ * always be preserved across buffer boarders.
+ * </p>
+ * <p>
+ * A return value of <code>false</code> means that this method call exhausted
+ * the reader, but there may be some bytes which have been read, which can be
+ * verified by checking whether <code>buffer.getLength() > 0</code>.
+ * </p>
+ *
+ * @param buffer
+ * the buffer to fill.
+ * @param reader
+ * the reader to read characters from.
+ * @param numChars
+ * the number of chars to read
+ * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
+ * @throws IOException
+ * if the reader throws an {@link IOException}.
+ */
+ public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
+ assert buffer.buffer.length >= 2;
+ if (numChars < 2 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+ }
+ final char[] charBuffer = buffer.buffer;
+ buffer.offset = 0;
+ final int offset;
+
+ // Install the previously saved ending high surrogate:
+ if (buffer.lastTrailingHighSurrogate != 0) {
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ buffer.lastTrailingHighSurrogate = 0;
+ offset = 1;
+ } else {
+ offset = 0;
+ }
+
+ final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+ buffer.length = offset + read;
+ final boolean result = buffer.length == numChars;
+ if (buffer.length < numChars) {
+ // We failed to fill the buffer. Even if the last char is a high
+ // surrogate, there is nothing we can do
+ return result;
+ }
+
+ if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+ buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ }
+ return result;
+ }
+
+ /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
+ public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+ return fill(buffer, reader, buffer.buffer.length);
+ }
+
+ static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
+ int read = 0;
+ while (read < len) {
+ final int r = reader.read(dest, offset + read, len - read);
+ if (r == -1) {
+ break;
+ }
+ read += r;
+ }
+ return read;
+ }
+
+ /**
+ * A simple IO buffer to use with
+ * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
+ */
+ public static final class CharacterBuffer {
+
+ private final char[] buffer;
+ private int offset;
+ private int length;
+ // NOTE: not private so outer class can access without
+ // $access methods:
+ char lastTrailingHighSurrogate;
+
+ CharacterBuffer(char[] buffer, int offset, int length) {
+ this.buffer = buffer;
+ this.offset = offset;
+ this.length = length;
+ }
+
+ /**
+ * Returns the internal buffer
+ *
+ * @return the buffer
+ */
+ public char[] getBuffer() {
+ return buffer;
+ }
+
+ /**
+ * Returns the data offset in the internal buffer.
+ *
+ * @return the offset
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * Return the length of the data in the internal buffer starting at
+ * {@link #getOffset()}
+ *
+ * @return the length
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Resets the CharacterBuffer. All internals are reset to its default
+ * values.
+ */
+ public void reset() {
+ offset = 0;
+ length = 0;
+ lastTrailingHighSurrogate = 0;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
new file mode 100644
index 0000000..cecad10
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * Abstract base class for TokenFilters that may remove tokens.
+ * You have to implement {@link #accept} and return a boolean if the current
+ * token should be preserved. {@link #incrementToken} uses this method
+ * to decide if a token should be passed to the caller.
+ */
+public abstract class FilteringTokenFilter extends TokenFilter {
+
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private int skippedPositions;
+
+ /**
+ * Create a new {@link FilteringTokenFilter}.
+ * @param in the {@link TokenStream} to consume
+ */
+ public FilteringTokenFilter(TokenStream in) {
+ super(in);
+ }
+
+ /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
+ protected abstract boolean accept() throws IOException;
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (accept()) {
+ if (skippedPositions != 0) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.getPositionIncrement();
+ }
+
+ // reached EOS -- return false
+ return false;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ skippedPositions = 0;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
new file mode 100644
index 0000000..b86684d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharacterUtils;
+
+/**
+ * Normalizes token text to lower case.
+ */
+public final class LowerCaseFilter extends TokenFilter {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ /**
+ * Create a new LowerCaseFilter, that normalizes token text to lower case.
+ *
+ * @param in TokenStream to filter
+ */
+ public LowerCaseFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+ return true;
+ } else
+ return false;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
new file mode 100644
index 0000000..79707bc
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Removes stop words from a token stream.
+ */
+public final class StopFilter extends FilteringTokenFilter {
+
+ private final CharArraySet stopWords;
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ /**
+ * Constructs a filter which removes words from the input TokenStream that are
+ * named in the Set.
+ *
+ * @param in
+ * Input stream
+ * @param stopWords
+ * A {@link CharArraySet} representing the stopwords.
+ * @see #makeStopSet(java.lang.String...)
+ */
+ public StopFilter(TokenStream in, CharArraySet stopWords) {
+ super(in);
+ this.stopWords = stopWords;
+ }
+
+ /**
+ * Builds a Set from an array of stop words,
+ * appropriate for passing into the StopFilter constructor.
+ * This permits this stopWords construction to be cached once when
+ * an Analyzer is constructed.
+ *
+ * @param stopWords An array of stopwords
+ * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+ */
+ public static CharArraySet makeStopSet(String... stopWords) {
+ return makeStopSet(stopWords, false);
+ }
+
+ /**
+ * Builds a Set from an array of stop words,
+ * appropriate for passing into the StopFilter constructor.
+ * This permits this stopWords construction to be cached once when
+ * an Analyzer is constructed.
+ *
+ * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
+ * @return A Set ({@link CharArraySet}) containing the words
+ * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+ */
+ public static CharArraySet makeStopSet(List<?> stopWords) {
+ return makeStopSet(stopWords, false);
+ }
+
+ /**
+ * Creates a stopword set from the given stopword array.
+ *
+ * @param stopWords An array of stopwords
+ * @param ignoreCase If true, all words are lower cased first.
+ * @return a Set containing the words
+ */
+ public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
+ CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
+ stopSet.addAll(Arrays.asList(stopWords));
+ return stopSet;
+ }
+
+ /**
+ * Creates a stopword set from the given stopword list.
+ * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
+ * @param ignoreCase if true, all words are lower cased first
+ * @return A Set ({@link CharArraySet}) containing the words
+ */
+ public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase){
+ CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
+ stopSet.addAll(stopWords);
+ return stopSet;
+ }
+
+ /**
+ * Returns the next input Token whose term() is not a stop word.
+ */
+ @Override
+ protected boolean accept() {
+ return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java b/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
new file mode 100644
index 0000000..c35e715
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Base class for Analyzers that need to make use of stopword sets.
+ *
+ */
+public abstract class StopwordAnalyzerBase extends Analyzer {
+
+ /**
+ * An immutable stopword set
+ */
+ protected final CharArraySet stopwords;
+
+ /**
+ * Returns the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ *
+ * @return the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ */
+ public CharArraySet getStopwordSet() {
+ return stopwords;
+ }
+
+ /**
+ * Creates a new instance initialized with the given stopword set
+ *
+ * @param stopwords
+ * the analyzer's stopword set
+ */
+ protected StopwordAnalyzerBase(final CharArraySet stopwords) {
+ // analyzers should use char array set for stopwords!
+ this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
+ .unmodifiableSet(CharArraySet.copy(stopwords));
+ }
+
+ /**
+ * Creates a new Analyzer with an empty stopword set
+ */
+ protected StopwordAnalyzerBase() {
+ this(null);
+ }
+
+ /**
+ * Creates a CharArraySet from a file resource associated with a class. (See
+ * {@link Class#getResourceAsStream(String)}).
+ *
+ * @param ignoreCase
+ * <code>true</code> if the set should ignore the case of the
+ * stopwords, otherwise <code>false</code>
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param resource
+ * name of the resource file associated with the given class
+ * @param comment
+ * comment string to ignore in the stopword file
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+ final Class<? extends Analyzer> aClass, final String resource,
+ final String comment) throws IOException {
+ Reader reader = null;
+ try {
+ reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
+ return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase));
+ } finally {
+ IOUtils.close(reader);
+ }
+
+ }
+
+ /**
+ * Creates a CharArraySet from a path.
+ *
+ * @param stopwords
+ * the stopwords file to load
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(Path stopwords) throws IOException {
+ Reader reader = null;
+ try {
+ reader = Files.newBufferedReader(stopwords, StandardCharsets.UTF_8);
+ return WordlistLoader.getWordSet(reader);
+ } finally {
+ IOUtils.close(reader);
+ }
+ }
+
+ /**
+ * Creates a CharArraySet from a file.
+ *
+ * @param stopwords
+ * the stopwords reader to load
+ *
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * reader
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(Reader stopwords) throws IOException {
+ try {
+ return WordlistLoader.getWordSet(stopwords);
+ } finally {
+ IOUtils.close(stopwords);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
new file mode 100644
index 0000000..2397e66
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Loader for text files that represent a list of stopwords.
+ *
+ * @see IOUtils to obtain {@link Reader} instances
+ * @lucene.internal
+ */
+public class WordlistLoader {
+
+ private static final int INITIAL_CAPACITY = 16;
+
+ /** no instance */
+ private WordlistLoader() {}
+
+ /**
+ * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param result the {@link CharArraySet} to fill with the readers words
+ * @return the given {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String word = null;
+ while ((word = br.readLine()) != null) {
+ result.add(word.trim());
+ }
+ }
+ finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+ /**
+ * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @return A {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader) throws IOException {
+ return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+ }
+
+ /**
+ * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param comment The string representing a comment.
+ * @return A CharArraySet with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
+ return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
+ }
+
+ /**
+ * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param comment The string representing a comment.
+ * @param result the {@link CharArraySet} to fill with the readers words
+ * @return the given {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String word = null;
+ while ((word = br.readLine()) != null) {
+ if (word.startsWith(comment) == false){
+ result.add(word.trim());
+ }
+ }
+ }
+ finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @param result the {@link CharArraySet} to fill with the readers words
+ * @return the given {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
+ throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ int comment = line.indexOf('|');
+ if (comment >= 0) line = line.substring(0, comment);
+ String words[] = line.split("\\s+");
+ for (int i = 0; i < words.length; i++)
+ if (words[i].length() > 0) result.add(words[i]);
+ }
+ } finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @return A {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
+ return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+ }
+
+
+ /**
+ * Reads a stem dictionary. Each line contains:
+ * <pre>word<b>\t</b>stem</pre>
+ * (i.e. two tab separated words)
+ *
+ * @return stem dictionary that overrules the stemming algorithm
+ * @throws IOException If there is a low-level I/O error.
+ */
+ public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] wordstem = line.split("\t", 2);
+ result.put(wordstem[0], wordstem[1]);
+ }
+ } finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+ /**
+ * Accesses a resource by name and returns the (non comment) lines containing
+ * data using the given character encoding.
+ *
+ * <p>
+ * A comment line is any line that starts with the character "#"
+ * </p>
+ *
+ * @return a list of non-blank non-comment lines with whitespace trimmed
+ * @throws IOException If there is a low-level I/O error.
+ */
+ public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
+ BufferedReader input = null;
+ ArrayList<String> lines;
+ boolean success = false;
+ try {
+ input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
+
+ lines = new ArrayList<>();
+ for (String word=null; (word=input.readLine())!=null;) {
+ // skip initial bom marker
+ if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
+ word = word.substring(1);
+ // skip comments
+ if (word.startsWith("#")) continue;
+ word=word.trim();
+ // skip blank lines
+ if (word.length()==0) continue;
+ lines.add(word);
+ }
+ success = true;
+ return lines;
+ } finally {
+ if (success) {
+ IOUtils.close(input);
+ } else {
+ IOUtils.closeWhileHandlingException(input);
+ }
+ }
+ }
+
+ private static BufferedReader getBufferedReader(Reader reader) {
+ return (reader instanceof BufferedReader) ? (BufferedReader) reader
+ : new BufferedReader(reader);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
index 511f268..81858df 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
@@ -156,7 +156,7 @@
* over and over in many places, you can make a subclass of
* {@link org.apache.lucene.analysis.Analyzer}. In fact, Apache Lucene
* supplies a large family of <code>Analyzer</code> classes that deliver useful
- * analysis chains. The most common of these is the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
+ * analysis chains. The most common of these is the <a href="{@docRoot}/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
* Many applications will have a long and industrious life with nothing more
* than the <code>StandardAnalyzer</code>. The <a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a>
* library provides many pre-existing analyzers for various languages.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
new file mode 100644
index 0000000..251017d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader;
+
+/**
+ * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
+ * LowerCaseFilter} and {@link StopFilter}, using a list of
+ * English stop words.
+ */
+public final class StandardAnalyzer extends StopwordAnalyzerBase {
+
+ /** An unmodifiable set containing some common English words that are not usually useful
+ for searching.*/
+ public static final CharArraySet ENGLISH_STOP_WORDS_SET;
+
+ static {
+ final List<String> stopWords = Arrays.asList(
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ );
+ final CharArraySet stopSet = new CharArraySet(stopWords, false);
+ ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+ }
+
+ /** Default maximum allowed token length */
+ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /** An unmodifiable set containing some common English words that are usually not
+ useful for searching. */
+ public static final CharArraySet STOP_WORDS_SET = ENGLISH_STOP_WORDS_SET;
+
+ /** Builds an analyzer with the given stop words.
+ * @param stopWords stop words */
+ public StandardAnalyzer(CharArraySet stopWords) {
+ super(stopWords);
+ }
+
+ /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
+ */
+ public StandardAnalyzer() {
+ this(STOP_WORDS_SET);
+ }
+
+ /** Builds an analyzer with the stop words from the given reader.
+ * @see WordlistLoader#getWordSet(Reader)
+ * @param stopwords Reader to read stop words from */
+ public StandardAnalyzer(Reader stopwords) throws IOException {
+ this(loadStopwordSet(stopwords));
+ }
+
+ /**
+ * Set maximum allowed token length. If a token is seen
+ * that exceeds this length then it is discarded. This
+ * setting only takes effect the next time tokenStream
+ * is called.
+ */
+ public void setMaxTokenLength(int length) {
+ maxTokenLength = length;
+ }
+
+ /** Returns the current maximum token length
+ *
+ * @see #setMaxTokenLength */
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(final String fieldName) {
+ final StandardTokenizer src = new StandardTokenizer();
+ src.setMaxTokenLength(maxTokenLength);
+ TokenStream tok = new StandardFilter(src);
+ tok = new LowerCaseFilter(tok);
+ tok = new StopFilter(tok, stopwords);
+ return new TokenStreamComponents(src, tok) {
+ @Override
+ protected void setReader(final Reader reader) {
+ src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
+ super.setReader(reader);
+ }
+ };
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
new file mode 100644
index 0000000..202db37
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Normalizes tokens extracted with {@link StandardTokenizer}.
+ */
+public class StandardFilter extends TokenFilter {
+
+ /** Sole constructor */
+ public StandardFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ return input.incrementToken(); // TODO: add some niceties for the new grammar
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
new file mode 100644
index 0000000..5c5169a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeFactory;
+
+/** A grammar-based tokenizer constructed with JFlex.
+ * <p>
+ * This class implements the Word Break rules from the
+ * Unicode Text Segmentation algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>Many applications have specific tokenizer needs. If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+
+public final class StandardTokenizer extends Tokenizer {
+ /** A private instance of the JFlex-constructed scanner */
+ private StandardTokenizerImpl scanner;
+
+ // TODO: how can we remove these old types?!
+ /** Alpha/numeric token type */
+ public static final int ALPHANUM = 0;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int APOSTROPHE = 1;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int ACRONYM = 2;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int COMPANY = 3;
+ /** Email token type */
+ public static final int EMAIL = 4;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int HOST = 5;
+ /** Numeric token type */
+ public static final int NUM = 6;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int CJ = 7;
+
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int ACRONYM_DEP = 8;
+
+ /** Southeast Asian token type */
+ public static final int SOUTHEAST_ASIAN = 9;
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC = 10;
+ /** Hiragana token type */
+ public static final int HIRAGANA = 11;
+ /** Katakana token type */
+ public static final int KATAKANA = 12;
+
+ /** Hangul token type */
+ public static final int HANGUL = 13;
+
+ /** String token types that correspond to token type int constants */
+ public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ "<ACRONYM_DEP>",
+ "<SOUTHEAST_ASIAN>",
+ "<IDEOGRAPHIC>",
+ "<HIRAGANA>",
+ "<KATAKANA>",
+ "<HANGUL>"
+ };
+
+ /** Absolute maximum sized token */
+ public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+
+ private int skippedPositions;
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+ /**
+ * Set the max allowed token length. No tokens longer than this are emitted.
+ *
+ * @throws IllegalArgumentException if the given length is outside of the
+ * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+ */
+ public void setMaxTokenLength(int length) {
+ if (length < 1) {
+ throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+ } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+ throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+ }
+ if (length != maxTokenLength) {
+ maxTokenLength = length;
+ scanner.setBufferSize(length);
+ }
+ }
+
+ /** Returns the current maximum token length
+ *
+ * @see #setMaxTokenLength */
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ /**
+ * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
+ * the <code>input</code> to the newly created JFlex scanner.
+
+ * See http://issues.apache.org/jira/browse/LUCENE-1068
+ */
+ public StandardTokenizer() {
+ init();
+ }
+
+ /**
+ * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
+ */
+ public StandardTokenizer(AttributeFactory factory) {
+ super(factory);
+ init();
+ }
+
+ private void init() {
+ this.scanner = new StandardTokenizerImpl(input);
+ }
+
+ // this tokenizer generates three attributes:
+ // term offset, positionIncrement and type
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ @Override
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ skippedPositions = 0;
+
+ while(true) {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF) {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength) {
+ posIncrAtt.setPositionIncrement(skippedPositions+1);
+ scanner.getText(termAtt);
+ final int start = scanner.yychar();
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+ typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
+ return true;
+ } else
+ // When we skip a too-long term, we still increment the
+ // position increment
+ skippedPositions++;
+ }
+ }
+
+ @Override
+ public final void end() throws IOException {
+ super.end();
+ // set final offset
+ int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ // adjust any skipped tokens
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ scanner.yyreset(input);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ scanner.yyreset(input);
+ skippedPositions = 0;
+ }
+}
[04/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
new file mode 100644
index 0000000..5d7b240
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@@ -0,0 +1,823 @@
+/* The following code was generated by JFlex 1.6.0 */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This class implements Word Break rules from the Unicode Text Segmentation
+ * algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>
+ * Tokens produced are of the following types:
+ * <ul>
+ * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
+ * <li><NUM>: A number</li>
+ * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
+ * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
+ * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
+ * <li><HIRAGANA>: A single hiragana character</li>
+ * <li><KATAKANA>: A sequence of katakana characters</li>
+ * <li><HANGUL>: A sequence of Hangul characters</li>
+ * </ul>
+ */
+@SuppressWarnings("fallthrough")
+
+public final class StandardTokenizerImpl {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private int ZZ_BUFFERSIZE = 255;
+
+ /** lexical states */
+ public static final int YYINITIAL = 0;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
+ "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
+ "\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
+ "\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
+ "\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
+ "\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
+ "\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
+ "\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
+ "\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
+ "\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
+ "\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
+ "\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
+ "\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
+ "\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
+ "\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
+ "\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
+ "\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
+ "\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
+ "\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
+ "\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
+ "\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
+ "\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
+ "\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
+ "\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
+ "\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
+ "\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
+ "\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
+ "\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
+ "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
+ "\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
+ "\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
+ "\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
+ "\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
+ "\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
+ "\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
+ "\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
+ "\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
+ "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
+ "\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
+ "\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
+ "\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
+ "\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
+ "\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
+ "\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
+ "\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
+ "\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
+ "\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
+ "\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
+ "\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
+ "\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
+ "\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
+ "\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
+ "\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
+ "\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
+ "\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
+ "\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
+ "\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
+ "\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
+ "\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
+ "\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
+ "\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
+ "\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
+ "\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
+ "\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
+ "\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
+ "\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
+ "\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
+ "\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
+ "\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
+ "\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
+ "\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
+ "\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
+ "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
+ "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
+ "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
+ "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
+ "\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
+ "\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
+ "\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
+ "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
+ "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
+ "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
+ "\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
+ "\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
+ "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
+ "\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
+ "\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
+ "\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
+ "\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
+ "\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
+ "\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
+ "\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
+ "\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
+ "\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
+ "\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
+ "\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
+ "\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
+ "\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
+ "\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
+ "\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
+ "\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
+ "\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
+ "\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
+ "\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
+ "\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
+ "\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
+ "\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
+ "\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
+ "\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
+ "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
+ "\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
+ "\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
+ "\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
+ "\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
+ "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
+ "\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
+ "\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
+ "\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
+ "\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
+ "\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
+ "\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
+ "\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
+ "\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
+ "\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
+ "\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
+ "\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
+ "\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
+ "\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
+ "\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
+ "\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
+ "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
+ "\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
+ "\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
+ "\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
+ "\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
+ "\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
+ "\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
+ "\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
+ "\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
+ "\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
+ "\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
+ "\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
+ "\1\4\1\0\2\2\2\0\1\1\1\0";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[24];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
+ "\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
+ "\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[24];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
+ "\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
+ "\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
+ "\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
+ "\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
+ "\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
+ "\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
+ "\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
+ "\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
+ "\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
+ "\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
+ "\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
+ "\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
+ "\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
+ "\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
+ "\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
+ "\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
+ "\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
+ "\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
+ "\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
+ "\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
+ "\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
+ "\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
+ "\1\30\1\15\14\0\1\30";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[396];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
+ "\2\1\2\0\1\1\1\0";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[24];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /**
+ * The number of occupied positions in zzBuffer beyond zzEndRead.
+ * When a lead/high surrogate has been read from the input stream
+ * into the final zzBuffer position, this will have a value of 1;
+ * otherwise, it will have a value of 0.
+ */
+ private int zzFinalHighSurrogate = 0;
+
+ /* user code: */
+ /** Alphanumeric sequences */
+ public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
+
+ /** Numbers */
+ public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
+
+ /**
+ * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
+ * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
+ * together as a single token rather than broken up, because the logic
+ * required to break them at word boundaries is too complex for UAX#29.
+ * <p>
+ * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
+ */
+ public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
+
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
+
+ /** Hiragana token type */
+ public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+
+ /** Katakana token type */
+ public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+ /** Hangul token type */
+ public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+ /** Character count processed so far */
+ public final int yychar()
+ {
+ return yychar;
+ }
+
+ /**
+ * Fills CharTermAttribute with the current token text.
+ */
+ public final void getText(CharTermAttribute t) {
+ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+ }
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
+
+
+ /**
+ * Creates a new scanner
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public StandardTokenizerImpl(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x110000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 2836) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ zzEndRead += zzFinalHighSurrogate;
+ zzFinalHighSurrogate = 0;
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+
+ /* fill the buffer with new input */
+ int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
+ int totalRead = 0;
+ while (totalRead < requested) {
+ int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
+ if (numRead == -1) {
+ break;
+ }
+ totalRead += numRead;
+ }
+
+ if (totalRead > 0) {
+ zzEndRead += totalRead;
+ if (totalRead == requested) { /* possibly more input available */
+ if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ --zzEndRead;
+ zzFinalHighSurrogate = 1;
+ if (totalRead == 1) { return true; }
+ }
+ }
+ return false;
+ }
+
+ // totalRead = 0: End of stream
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * <b>cannot</b> be reused (internal buffer is discarded and lost).
+ * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ *
+ * Internal scan buffer is resized down to its initial length, if it has grown.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ zzFinalHighSurrogate = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ if (zzBuffer.length > ZZ_BUFFERSIZE)
+ zzBuffer = new char[ZZ_BUFFERSIZE];
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position <tt>pos</tt> from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a well-formed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public int getNextToken() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar+= zzMarkedPosL-zzStartRead;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ }
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL) {
+ zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+ zzCurrentPosL += Character.charCount(zzInput);
+ }
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+ zzCurrentPosL += Character.charCount(zzInput);
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 1:
+ { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+ }
+ case 9: break;
+ case 2:
+ { return WORD_TYPE;
+ }
+ case 10: break;
+ case 3:
+ { return HANGUL_TYPE;
+ }
+ case 11: break;
+ case 4:
+ { return NUMERIC_TYPE;
+ }
+ case 12: break;
+ case 5:
+ { return KATAKANA_TYPE;
+ }
+ case 13: break;
+ case 6:
+ { return IDEOGRAPHIC_TYPE;
+ }
+ case 14: break;
+ case 7:
+ { return HIRAGANA_TYPE;
+ }
+ case 15: break;
+ case 8:
+ { return SOUTH_EAST_ASIAN_TYPE;
+ }
+ case 16: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ {
+ return YYEOF;
+ }
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
new file mode 100644
index 0000000..24c401d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This class implements Word Break rules from the Unicode Text Segmentation
+ * algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>
+ * Tokens produced are of the following types:
+ * <ul>
+ * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
+ * <li><NUM>: A number</li>
+ * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
+ * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
+ * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
+ * <li><HIRAGANA>: A single hiragana character</li>
+ * <li><KATAKANA>: A sequence of katakana characters</li>
+ * <li><HANGUL>: A sequence of Hangul characters</li>
+ * </ul>
+ */
+@SuppressWarnings("fallthrough")
+%%
+
+%unicode 6.3
+%integer
+%final
+%public
+%class StandardTokenizerImpl
+%function getNextToken
+%char
+%buffer 255
+
+// UAX#29 WB4. X (Extend | Format)* --> X
+//
+HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
+HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
+NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
+KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
+MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
+MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
+ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
+HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
+HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
+SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
+DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
+HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
+RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
+ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
+
+%{
+ /** Alphanumeric sequences */
+ public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
+
+ /** Numbers */
+ public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
+
+ /**
+ * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
+ * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
+ * together as a single token rather than broken up, because the logic
+ * required to break them at word boundaries is too complex for UAX#29.
+ * <p>
+ * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
+ */
+ public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
+
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
+
+ /** Hiragana token type */
+ public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+
+ /** Katakana token type */
+ public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+ /** Hangul token type */
+ public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+ /** Character count processed so far */
+ public final int yychar()
+ {
+ return yychar;
+ }
+
+ /**
+ * Fills CharTermAttribute with the current token text.
+ */
+ public final void getText(CharTermAttribute t) {
+ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+ }
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
+%}
+
+%%
+
+// UAX#29 WB1. sot ÷
+// WB2. ÷ eot
+//
+<<EOF>> { return YYEOF; }
+
+// UAX#29 WB8. Numeric × Numeric
+// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
+ { return NUMERIC_TYPE; }
+
+// subset of the below for typing purposes only!
+{HangulEx}+
+ { return HANGUL_TYPE; }
+
+{KatakanaEx}+
+ { return KATAKANA_TYPE; }
+
+// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
+// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
+// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
+// WB7a. Hebrew_Letter × Single_Quote
+// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
+// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
+// WB9. (ALetter | Hebrew_Letter) × Numeric
+// WB10. Numeric × (ALetter | Hebrew_Letter)
+// WB13. Katakana × Katakana
+// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+ )+
+ )
+({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+ )+
+ )
+)*
+{ExtendNumLetEx}*
+ { return WORD_TYPE; }
+
+
+// From UAX #29:
+//
+// [C]haracters with the Line_Break property values of Contingent_Break (CB),
+// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
+// boundary property values based on criteria outside of the scope of this
+// annex. That means that satisfactory treatment of languages like Chinese
+// or Thai requires special handling.
+//
+// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+// property: U+FFFC ( \ufffc ) OBJECT REPLACEMENT CHARACTER.
+//
+// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
+// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
+// Lao, etc.) are kept together. This grammar does the same below.
+//
+// See also the Unicode Line Breaking Algorithm:
+//
+// http://www.unicode.org/reports/tr14/#SA
+//
+{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
+
+// UAX#29 WB14. Any ÷ Any
+//
+{HanEx} { return IDEOGRAPHIC_TYPE; }
+{HiraganaEx} { return HIRAGANA_TYPE; }
+
+
+// UAX#29 WB3. CR × LF
+// WB3a. (Newline | CR | LF) ÷
+// WB3b. ÷ (Newline | CR | LF)
+// WB13c. Regional_Indicator × Regional_Indicator
+// WB14. Any ÷ Any
+//
+{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
+ { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
new file mode 100644
index 0000000..39ce8f9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Fast, general-purpose grammar-based tokenizer {@link org.apache.lucene.analysis.standard.StandardTokenizer}
+ * implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * Unlike <code>UAX29URLEmailTokenizer</code> from the analysis module, URLs and email addresses are
+ * <b>not</b> tokenized as single tokens, but are instead split up into
+ * tokens according to the UAX#29 word break rules.
+ * <br>
+ * {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} includes
+ * {@link org.apache.lucene.analysis.standard.StandardTokenizer StandardTokenizer},
+ * {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
+ * {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
+ * and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ */
+
+package org.apache.lucene.analysis.standard;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 50d2482..368259a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -21,6 +21,7 @@ import java.io.PrintStream;
import java.util.EnumSet;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.Sort;
@@ -121,7 +122,21 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
}
/**
- * Creates a new config that with the default {@link
+ * Creates a new config, using {@link StandardAnalyzer} as the
+ * analyzer. By default, {@link TieredMergePolicy} is used
+ * for merging;
+ * Note that {@link TieredMergePolicy} is free to select
+ * non-contiguous merges, which means docIDs may not
+ * remain monotonic over time. If this is a problem you
+ * should switch to {@link LogByteSizeMergePolicy} or
+ * {@link LogDocMergePolicy}.
+ */
+ public IndexWriterConfig() {
+ this(new StandardAnalyzer());
+ }
+
+ /**
+ * Creates a new config with the provided {@link
* Analyzer}. By default, {@link TieredMergePolicy} is used
* for merging;
* Note that {@link TieredMergePolicy} is free to select
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
index 3fda7c3..82281a9e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
@@ -62,7 +62,7 @@ final class Direct16 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
index aec9eaf..502aa3f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
@@ -62,7 +62,7 @@ final class Direct32 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
index b8e06b6..106f641 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
@@ -57,7 +57,7 @@ final class Direct64 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
index 81fc5a9..27986c0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
@@ -60,7 +60,7 @@ final class Direct8 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
index 02f4e41..8e8e94d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
@@ -112,7 +112,7 @@ final class Packed16ThreeBlocks extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
+ RamUsageEstimator.sizeOf(blocks);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
index 85e7ea8..a7262b3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
@@ -61,7 +61,7 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
+ RamUsageEstimator.sizeOf(blocks);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
index 3ec6df0..5a85735 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
@@ -110,7 +110,7 @@ final class Packed8ThreeBlocks extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
+ RamUsageEstimator.sizeOf(blocks);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/overview.html b/lucene/core/src/java/overview.html
index 9086cf9..b7112ac 100644
--- a/lucene/core/src/java/overview.html
+++ b/lucene/core/src/java/overview.html
@@ -78,7 +78,7 @@ to the output of a {@link org.apache.lucene.analysis.Tokenizer Tokenizer}.
Tokenizers and TokenFilters are strung together and applied with an {@link org.apache.lucene.analysis.Analyzer Analyzer}.
<a href="../analyzers-common/overview-summary.html">analyzers-common</a> provides a number of Analyzer implementations, including
<a href="../analyzers-common/org/apache/lucene/analysis/core/StopAnalyzer.html">StopAnalyzer</a>
-and the grammar-based <a href="../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.</li>
+and the grammar-based <a href="org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.</li>
<li>
<b>{@link org.apache.lucene.codecs}</b>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
new file mode 100644
index 0000000..2d63b66
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.util.*;
+
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCharArrayMap extends LuceneTestCase {
+ public void doRandom(int iter, boolean ignoreCase) {
+ CharArrayMap<Integer> map = new CharArrayMap<>(1, ignoreCase);
+ HashMap<String,Integer> hmap = new HashMap<>();
+
+ char[] key;
+ for (int i=0; i<iter; i++) {
+ int len = random().nextInt(5);
+ key = new char[len];
+ for (int j=0; j<key.length; j++) {
+ key[j] = (char)random().nextInt(127);
+ }
+ String keyStr = new String(key);
+ String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr;
+
+ int val = random().nextInt();
+
+ Object o1 = map.put(key, val);
+ Object o2 = hmap.put(hmapKey,val);
+ assertEquals(o1,o2);
+
+ // add it again with the string method
+ assertEquals(val, map.put(keyStr,val).intValue());
+
+ assertEquals(val, map.get(key,0,key.length).intValue());
+ assertEquals(val, map.get(key).intValue());
+ assertEquals(val, map.get(keyStr).intValue());
+
+ assertEquals(hmap.size(), map.size());
+ }
+ }
+
+ public void testCharArrayMap() {
+ int num = 5 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < num; i++) { // pump this up for more random testing
+ doRandom(1000,false);
+ doRandom(1000,true);
+ }
+ }
+
+ public void testMethods() {
+ CharArrayMap<Integer> cm = new CharArrayMap<>(2, false);
+ HashMap<String,Integer> hm = new HashMap<>();
+ hm.put("foo",1);
+ hm.put("bar",2);
+ cm.putAll(hm);
+ assertEquals(hm.size(), cm.size());
+ hm.put("baz", 3);
+ cm.putAll(hm);
+ assertEquals(hm.size(), cm.size());
+
+ CharArraySet cs = cm.keySet();
+ int n=0;
+ for (Object o : cs) {
+ assertTrue(cm.containsKey(o));
+ char[] co = (char[]) o;
+ assertTrue(cm.containsKey(co, 0, co.length));
+ n++;
+ }
+ assertEquals(hm.size(), n);
+ assertEquals(hm.size(), cs.size());
+ assertEquals(cm.size(), cs.size());
+ cs.clear();
+ assertEquals(0, cs.size());
+ assertEquals(0, cm.size());
+ // keySet() should not allow adding new keys
+ expectThrows(UnsupportedOperationException.class, () -> {
+ cs.add("test");
+ });
+
+ cm.putAll(hm);
+ assertEquals(hm.size(), cs.size());
+ assertEquals(cm.size(), cs.size());
+
+ Iterator<Map.Entry<Object,Integer>> iter1 = cm.entrySet().iterator();
+ n=0;
+ while (iter1.hasNext()) {
+ Map.Entry<Object,Integer> entry = iter1.next();
+ Object key = entry.getKey();
+ Integer val = entry.getValue();
+ assertEquals(cm.get(key), val);
+ entry.setValue(val*100);
+ assertEquals(val*100, (int)cm.get(key));
+ n++;
+ }
+ assertEquals(hm.size(), n);
+ cm.clear();
+ cm.putAll(hm);
+ assertEquals(cm.size(), n);
+
+ CharArrayMap<Integer>.EntryIterator iter2 = cm.entrySet().iterator();
+ n=0;
+ while (iter2.hasNext()) {
+ char[] keyc = iter2.nextKey();
+ Integer val = iter2.currentValue();
+ assertEquals(hm.get(new String(keyc)), val);
+ iter2.setValue(val*100);
+ assertEquals(val*100, (int)cm.get(keyc));
+ n++;
+ }
+ assertEquals(hm.size(), n);
+
+ cm.entrySet().clear();
+ assertEquals(0, cm.size());
+ assertEquals(0, cm.entrySet().size());
+ assertTrue(cm.isEmpty());
+ }
+
+ // TODO: break this up into simpler test methods vs. "telling a story"
+ public void testModifyOnUnmodifiable(){
+ CharArrayMap<Integer> map = new CharArrayMap<>(2, false);
+ map.put("foo",1);
+ map.put("bar",2);
+ final int size = map.size();
+ assertEquals(2, size);
+ assertTrue(map.containsKey("foo"));
+ assertEquals(1, map.get("foo").intValue());
+ assertTrue(map.containsKey("bar"));
+ assertEquals(2, map.get("bar").intValue());
+
+ map = CharArrayMap.unmodifiableMap(map);
+ assertEquals("Map size changed due to unmodifiableMap call" , size, map.size());
+ String NOT_IN_MAP = "SirGallahad";
+ assertFalse("Test String already exists in map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String already exists in map", map.get(NOT_IN_MAP));
+
+ try{
+ map.put(NOT_IN_MAP.toCharArray(), 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.put(NOT_IN_MAP, 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.put(new StringBuilder(NOT_IN_MAP), 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.clear();
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.entrySet().clear();
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.keySet().clear();
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.put((Object) NOT_IN_MAP, 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.putAll(Collections.singletonMap(NOT_IN_MAP, 3));
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ assertTrue(map.containsKey("foo"));
+ assertEquals(1, map.get("foo").intValue());
+ assertTrue(map.containsKey("bar"));
+ assertEquals(2, map.get("bar").intValue());
+ }
+
+ public void testToString() {
+ CharArrayMap<Integer> cm = new CharArrayMap<>(Collections.singletonMap("test",1), false);
+ assertEquals("[test]",cm.keySet().toString());
+ assertEquals("[1]",cm.values().toString());
+ assertEquals("[test=1]",cm.entrySet().toString());
+ assertEquals("{test=1}",cm.toString());
+ cm.put("test2", 2);
+ assertTrue(cm.keySet().toString().contains(", "));
+ assertTrue(cm.values().toString().contains(", "));
+ assertTrue(cm.entrySet().toString().contains(", "));
+ assertTrue(cm.toString().contains(", "));
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
new file mode 100644
index 0000000..465f512
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.*;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestCharArraySet extends LuceneTestCase {
+
+ static final String[] TEST_STOP_WORDS = {
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ };
+
+
+ public void testRehash() throws Exception {
+ CharArraySet cas = new CharArraySet(0, true);
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ cas.add(TEST_STOP_WORDS[i]);
+ assertEquals(TEST_STOP_WORDS.length, cas.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertTrue(cas.contains(TEST_STOP_WORDS[i]));
+ }
+
+ public void testNonZeroOffset() {
+ String[] words={"Hello","World","this","is","a","test"};
+ char[] findme="xthisy".toCharArray();
+ CharArraySet set= new CharArraySet(10, true);
+ set.addAll(Arrays.asList(words));
+ assertTrue(set.contains(findme, 1, 4));
+ assertTrue(set.contains(new String(findme,1,4)));
+
+ // test unmodifiable
+ set = CharArraySet.unmodifiableSet(set);
+ assertTrue(set.contains(findme, 1, 4));
+ assertTrue(set.contains(new String(findme,1,4)));
+ }
+
+ public void testObjectContains() {
+ CharArraySet set = new CharArraySet(10, true);
+ Integer val = Integer.valueOf(1);
+ set.add(val);
+ assertTrue(set.contains(val));
+ assertTrue(set.contains(new Integer(1))); // another integer
+ assertTrue(set.contains("1"));
+ assertTrue(set.contains(new char[]{'1'}));
+ // test unmodifiable
+ set = CharArraySet.unmodifiableSet(set);
+ assertTrue(set.contains(val));
+ assertTrue(set.contains(new Integer(1))); // another integer
+ assertTrue(set.contains("1"));
+ assertTrue(set.contains(new char[]{'1'}));
+ }
+
+ public void testClear(){
+ CharArraySet set=new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+ set.clear();
+ assertEquals("not empty", 0, set.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertFalse(set.contains(TEST_STOP_WORDS[i]));
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertTrue(set.contains(TEST_STOP_WORDS[i]));
+ }
+
+ // TODO: break this up into simpler test methods, vs "telling a story"
+ public void testModifyOnUnmodifiable(){
+ CharArraySet set=new CharArraySet(10, true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ final int size = set.size();
+ set = CharArraySet.unmodifiableSet(set);
+ assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+ String NOT_IN_SET = "SirGallahad";
+ assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
+
+ try{
+ set.add(NOT_IN_SET.toCharArray());
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.add(NOT_IN_SET);
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.add(new StringBuilder(NOT_IN_SET));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.clear();
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+ try{
+ set.add((Object) NOT_IN_SET);
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ // This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
+ // current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefore never call
+ // remove() on the iterator
+ try{
+ set.removeAll(new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.retainAll(new CharArraySet(Arrays.asList(NOT_IN_SET), true));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.addAll(Arrays.asList(NOT_IN_SET));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ }
+
+ for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
+ assertTrue(set.contains(TEST_STOP_WORDS[i]));
+ }
+ }
+
+ public void testUnmodifiableSet(){
+ CharArraySet set = new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ set.add(Integer.valueOf(1));
+ final int size = set.size();
+ set = CharArraySet.unmodifiableSet(set);
+ assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+ for (String stopword : TEST_STOP_WORDS) {
+ assertTrue(set.contains(stopword));
+ }
+ assertTrue(set.contains(Integer.valueOf(1)));
+ assertTrue(set.contains("1"));
+ assertTrue(set.contains(new char[]{'1'}));
+
+ expectThrows(NullPointerException.class, () -> {
+ CharArraySet.unmodifiableSet(null);
+ });
+ }
+
+ public void testSupplementaryChars() {
+ String missing = "Term %s is missing in the set";
+ String falsePos = "Term %s is in the set but shouldn't";
+ // for reference see
+ // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
+ String[] upperArr = new String[] {"Abc\ud801\udc1c",
+ "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
+ String[] lowerArr = new String[] {"abc\ud801\udc44",
+ "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
+ CharArraySet set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
+ }
+ set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), false);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
+ }
+ }
+
+ public void testSingleHighSurrogate() {
+ String missing = "Term %s is missing in the set";
+ String falsePos = "Term %s is in the set but shouldn't";
+ String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
+ "\uD800EfG", "\uD800\ud801\udc1cB" };
+
+ String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
+ "\uD800efg", "\uD800\ud801\udc44b" };
+ CharArraySet set = new CharArraySet(Arrays
+ .asList(TEST_STOP_WORDS), true);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
+ }
+ set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS),
+ false);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertFalse(String.format(Locale.ROOT, falsePos, upperArr[i]), set
+ .contains(lowerArr[i]));
+ }
+ }
+
+ @SuppressWarnings("deprecated")
+ public void testCopyCharArraySetBWCompat() {
+ CharArraySet setIngoreCase = new CharArraySet(10, true);
+ CharArraySet setCaseSensitive = new CharArraySet(10, false);
+
+ List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+ List<String> stopwordsUpper = new ArrayList<>();
+ for (String string : stopwords) {
+ stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+ }
+ setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setIngoreCase.add(Integer.valueOf(1));
+ setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setCaseSensitive.add(Integer.valueOf(1));
+
+ CharArraySet copy = CharArraySet.copy(setIngoreCase);
+ CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
+
+ assertEquals(setIngoreCase.size(), copy.size());
+ assertEquals(setCaseSensitive.size(), copy.size());
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copyCaseSens.containsAll(stopwords));
+ for (String string : stopwordsUpper) {
+ assertFalse(copyCaseSens.contains(string));
+ }
+ // test adding terms to the copy
+ List<String> newWords = new ArrayList<>();
+ for (String string : stopwords) {
+ newWords.add(string+"_1");
+ }
+ copy.addAll(newWords);
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copy.containsAll(newWords));
+ // new added terms are not in the source set
+ for (String string : newWords) {
+ assertFalse(setIngoreCase.contains(string));
+ assertFalse(setCaseSensitive.contains(string));
+
+ }
+ }
+
+ /**
+ * Test the static #copy() function with a CharArraySet as a source
+ */
+ public void testCopyCharArraySet() {
+ CharArraySet setIngoreCase = new CharArraySet(10, true);
+ CharArraySet setCaseSensitive = new CharArraySet(10, false);
+
+ List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+ List<String> stopwordsUpper = new ArrayList<>();
+ for (String string : stopwords) {
+ stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+ }
+ setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setIngoreCase.add(Integer.valueOf(1));
+ setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setCaseSensitive.add(Integer.valueOf(1));
+
+ CharArraySet copy = CharArraySet.copy(setIngoreCase);
+ CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
+
+ assertEquals(setIngoreCase.size(), copy.size());
+ assertEquals(setCaseSensitive.size(), copy.size());
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copyCaseSens.containsAll(stopwords));
+ for (String string : stopwordsUpper) {
+ assertFalse(copyCaseSens.contains(string));
+ }
+ // test adding terms to the copy
+ List<String> newWords = new ArrayList<>();
+ for (String string : stopwords) {
+ newWords.add(string+"_1");
+ }
+ copy.addAll(newWords);
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copy.containsAll(newWords));
+ // new added terms are not in the source set
+ for (String string : newWords) {
+ assertFalse(setIngoreCase.contains(string));
+ assertFalse(setCaseSensitive.contains(string));
+
+ }
+ }
+
+ /**
+ * Test the static #copy() function with a JDK {@link Set} as a source
+ */
+ public void testCopyJDKSet() {
+ Set<String> set = new HashSet<>();
+
+ List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+ List<String> stopwordsUpper = new ArrayList<>();
+ for (String string : stopwords) {
+ stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+ }
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+
+ CharArraySet copy = CharArraySet.copy(set);
+
+ assertEquals(set.size(), copy.size());
+ assertEquals(set.size(), copy.size());
+
+ assertTrue(copy.containsAll(stopwords));
+ for (String string : stopwordsUpper) {
+ assertFalse(copy.contains(string));
+ }
+
+ List<String> newWords = new ArrayList<>();
+ for (String string : stopwords) {
+ newWords.add(string+"_1");
+ }
+ copy.addAll(newWords);
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(newWords));
+ // new added terms are not in the source set
+ for (String string : newWords) {
+ assertFalse(set.contains(string));
+ }
+ }
+
+ /**
+ * Tests a special case of {@link CharArraySet#copy(Set)} where the
+ * set to copy is the {@link CharArraySet#EMPTY_SET}
+ */
+ public void testCopyEmptySet() {
+ assertSame(CharArraySet.EMPTY_SET,
+ CharArraySet.copy(CharArraySet.EMPTY_SET));
+ }
+
+ /**
+ * Smoketests the static empty set
+ */
+ public void testEmptySet() {
+ assertEquals(0, CharArraySet.EMPTY_SET.size());
+
+ assertTrue(CharArraySet.EMPTY_SET.isEmpty());
+ for (String stopword : TEST_STOP_WORDS) {
+ assertFalse(CharArraySet.EMPTY_SET.contains(stopword));
+ }
+ assertFalse(CharArraySet.EMPTY_SET.contains("foo"));
+ assertFalse(CharArraySet.EMPTY_SET.contains((Object) "foo"));
+ assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray()));
+ assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray(),0,3));
+ }
+
+ /**
+ * Test for NPE
+ */
+ public void testContainsWithNull() {
+ CharArraySet set = new CharArraySet(1, true);
+
+ expectThrows(NullPointerException.class, () -> {
+ set.contains((char[]) null, 0, 10);
+ });
+
+ expectThrows(NullPointerException.class, () -> {
+ set.contains((CharSequence) null);
+ });
+
+ expectThrows(NullPointerException.class, () -> {
+ set.contains((Object) null);
+ });
+ }
+
+ public void testToString() {
+ CharArraySet set = CharArraySet.copy(Collections.singleton("test"));
+ assertEquals("[test]", set.toString());
+ set.add("test2");
+ assertTrue(set.toString().contains(", "));
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
new file mode 100644
index 0000000..53b3f56
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.Test;
+
+/**
+ * TestCase for the {@link CharacterUtils} class.
+ */
+public class TestCharacterUtils extends LuceneTestCase {
+
+ public void testConversions() {
+ final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
+ final int[] buf = new int[orig.length];
+ final char[] restored = new char[buf.length];
+ final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
+ final int o2 = TestUtil.nextInt(random(), 0, o1);
+ final int o3 = TestUtil.nextInt(random(), 0, o1);
+ final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
+ final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
+ assertEquals(orig.length - o1, charCount);
+ assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
+ }
+
+ @Test
+ public void testNewCharacterBuffer() {
+ CharacterBuffer newCharacterBuffer = CharacterUtils.newCharacterBuffer(1024);
+ assertEquals(1024, newCharacterBuffer.getBuffer().length);
+ assertEquals(0, newCharacterBuffer.getOffset());
+ assertEquals(0, newCharacterBuffer.getLength());
+
+ newCharacterBuffer = CharacterUtils.newCharacterBuffer(2);
+ assertEquals(2, newCharacterBuffer.getBuffer().length);
+ assertEquals(0, newCharacterBuffer.getOffset());
+ assertEquals(0, newCharacterBuffer.getLength());
+
+ // length must be >= 2
+ expectThrows(IllegalArgumentException.class, () -> {
+ CharacterUtils.newCharacterBuffer(1);
+ });
+ }
+
+ @Test
+ public void testFillNoHighSurrogate() throws IOException {
+ Reader reader = new StringReader("helloworld");
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
+ assertTrue(CharacterUtils.fill(buffer,reader));
+ assertEquals(0, buffer.getOffset());
+ assertEquals(6, buffer.getLength());
+ assertEquals("hellow", new String(buffer.getBuffer()));
+ assertFalse(CharacterUtils.fill(buffer,reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals(0, buffer.getOffset());
+
+ assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertFalse(CharacterUtils.fill(buffer,reader));
+ }
+
+ @Test
+ public void testFill() throws IOException {
+ String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
+ Reader reader = new StringReader(input);
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
+ assertTrue(CharacterUtils.fill(buffer, reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertTrue(CharacterUtils.fill(buffer, reader));
+ assertEquals(5, buffer.getLength());
+ assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
+ assertTrue(CharacterUtils.fill(buffer, reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals("123\ud801", new String(buffer.getBuffer(),
+ buffer.getOffset(), buffer.getLength()));
+ assertFalse(CharacterUtils.fill(buffer, reader));
+ assertEquals(3, buffer.getLength());
+ assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
+ .getOffset(), buffer.getLength()));
+ assertFalse(CharacterUtils.fill(buffer, reader));
+ assertEquals(0, buffer.getLength());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
new file mode 100644
index 0000000..c224682
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.English;
+
+public class TestStopFilter extends BaseTokenStreamTestCase {
+
+ // other StopFilter functionality is already tested by TestStopAnalyzer
+
+ public void testExactCase() throws IOException {
+ StringReader reader = new StringReader("Now is The Time");
+ CharArraySet stopWords = new CharArraySet(asSet("is", "the", "Time"), false);
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(reader);
+ TokenStream stream = new StopFilter(in, stopWords);
+ assertTokenStreamContents(stream, new String[] { "Now", "The" });
+ }
+
+ public void testStopFilt() throws IOException {
+ StringReader reader = new StringReader("Now is The Time");
+ String[] stopWords = new String[] { "is", "the", "Time" };
+ CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(reader);
+ TokenStream stream = new StopFilter(in, stopSet);
+ assertTokenStreamContents(stream, new String[] { "Now", "The" });
+ }
+
+ /**
+ * Test Position increments applied by StopFilter with and without enabling this option.
+ */
+ public void testStopPositons() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ ArrayList<String> a = new ArrayList<>();
+ for (int i=0; i<20; i++) {
+ String w = English.intToEnglish(i).trim();
+ sb.append(w).append(" ");
+ if (i%3 != 0) a.add(w);
+ }
+ log(sb.toString());
+ String stopWords[] = a.toArray(new String[0]);
+ for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
+ CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
+ // with increments
+ StringReader reader = new StringReader(sb.toString());
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(reader);
+ StopFilter stpf = new StopFilter(in, stopSet);
+ doTestStopPositons(stpf);
+ // with increments, concatenating two stop filters
+ ArrayList<String> a0 = new ArrayList<>();
+ ArrayList<String> a1 = new ArrayList<>();
+ for (int i=0; i<a.size(); i++) {
+ if (i%2==0) {
+ a0.add(a.get(i));
+ } else {
+ a1.add(a.get(i));
+ }
+ }
+ String stopWords0[] = a0.toArray(new String[0]);
+ for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
+ String stopWords1[] = a1.toArray(new String[0]);
+ for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
+ CharArraySet stopSet0 = StopFilter.makeStopSet(stopWords0);
+ CharArraySet stopSet1 = StopFilter.makeStopSet(stopWords1);
+ reader = new StringReader(sb.toString());
+ final MockTokenizer in1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in1.setReader(reader);
+ StopFilter stpf0 = new StopFilter(in1, stopSet0); // first part of the set
+ StopFilter stpf01 = new StopFilter(stpf0, stopSet1); // two stop filters concatenated!
+ doTestStopPositons(stpf01);
+ }
+
+ // LUCENE-3849: make sure after .end() we see the "ending" posInc
+ public void testEndStopword() throws Exception {
+ CharArraySet stopSet = StopFilter.makeStopSet("of");
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(new StringReader("test of"));
+ StopFilter stpf = new StopFilter(in, stopSet);
+ assertTokenStreamContents(stpf, new String[] { "test" },
+ new int[] {0},
+ new int[] {4},
+ null,
+ new int[] {1},
+ null,
+ 7,
+ 1,
+ null,
+ true);
+ }
+
+ private void doTestStopPositons(StopFilter stpf) throws IOException {
+ CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
+ stpf.reset();
+ for (int i=0; i<20; i+=3) {
+ assertTrue(stpf.incrementToken());
+ log("Token "+i+": "+stpf);
+ String w = English.intToEnglish(i).trim();
+ assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
+ assertEquals("all but first token must have position increment of 3",i==0?1:3,posIncrAtt.getPositionIncrement());
+ }
+ assertFalse(stpf.incrementToken());
+ stpf.end();
+ stpf.close();
+ }
+
+ // print debug info depending on VERBOSE
+ private static void log(String s) {
+ if (VERBOSE) {
+ System.out.println(s);
+ }
+ }
+
+ // stupid filter that inserts synonym of 'hte' for 'the'
+ private class MockSynonymFilter extends TokenFilter {
+ State bufferedState;
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ MockSynonymFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (bufferedState != null) {
+ restoreState(bufferedState);
+ posIncAtt.setPositionIncrement(0);
+ termAtt.setEmpty().append("hte");
+ bufferedState = null;
+ return true;
+ } else if (input.incrementToken()) {
+ if (termAtt.toString().equals("the")) {
+ bufferedState = captureState();
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ bufferedState = null;
+ }
+ }
+
+}
[06/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
deleted file mode 100644
index ec37924..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl
+++ /dev/null
@@ -1,232 +0,0 @@
-#!/usr/bin/perl
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-use warnings;
-use strict;
-use File::Spec;
-use Getopt::Long;
-use LWP::UserAgent;
-
-my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
-
-my $version = '';
-unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+\.\d+/) {
- print STDERR "Usage: $script_name -v <version>\n";
- print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
- if ($version);
- exit 1;
-}
-my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
-my $scripts_url = "${url_prefix}/Scripts.txt";
-my $line_break_url = "${url_prefix}/LineBreak.txt";
-my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
-my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
-my $underscore_version = $version;
-$underscore_version =~ s/\./_/g;
-my $class_name = "WordBreakTestUnicode_${underscore_version}";
-my $output_filename = "${class_name}.java";
-my $header =<<"__HEADER__";
-package org.apache.lucene.analysis.core;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
-
-/**
- * This class was automatically generated by ${script_name}
- * from: ${url_prefix}/auxiliary/WordBreakTest.txt
- *
- * WordBreakTest.txt indicates the points in the provided character sequences
- * at which conforming implementations must and must not break words. This
- * class tests for expected token extraction from each of the test sequences
- * in WordBreakTest.txt, where the expected tokens are those character
- * sequences bounded by word breaks and containing at least one character
- * from one of the following character sets:
- *
- * \\p{Script = Han} (From $scripts_url)
- * \\p{Script = Hiragana}
- * \\p{LineBreak = Complex_Context} (From $line_break_url)
- * \\p{WordBreak = ALetter} (From $word_break_url)
- * \\p{WordBreak = Hebrew_Letter}
- * \\p{WordBreak = Katakana}
- * \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
- * [\\uFF10-\\uFF19] (Full-width Arabic digits)
- */
-\@Ignore
-public class ${class_name} extends BaseTokenStreamTestCase {
-
- public void test(Analyzer analyzer) throws Exception {
-__HEADER__
-
-my $codepoints = [];
-map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
-# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
-# Using lowercase versions of property value names to allow for case-
-# insensitive comparison with the names in the Unicode data files.
-parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
-parse_Unicode_data_file($scripts_url, $codepoints,
- {'han' => 1, 'hiragana' => 1});
-parse_Unicode_data_file($word_break_url, $codepoints,
- {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
-my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
-
-my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
-open OUT, ">$output_path"
- || die "Error opening '$output_path' for writing: $!";
-
-print STDERR "Writing '$output_path'...";
-
-print OUT $header;
-
-for my $line (@tests) {
- next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
- # Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
- my ($sequence) = $line =~ /^(.*?)\s*\#/;
- $line =~ s/\t/ /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
- print OUT " // $line\n";
- $sequence =~ s/\s*÷\s*$//; # Trim trailing break character
- my $test_string = $sequence;
- $test_string =~ s/\s*÷\s*/\\u/g;
- $test_string =~ s/\s*×\s*/\\u/g;
- $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
- $test_string =~ s/\\u000A/\\n/g;
- $test_string =~ s/\\u000D/\\r/g;
- $test_string =~ s/\\u0022/\\\"/g;
- $sequence =~ s/^\s*÷\s*//; # Trim leading break character
- my @tokens = ();
- for my $candidate (split /\s*÷\s*/, $sequence) {
- my @chars = ();
- my $has_wanted_char = 0;
- while ($candidate =~ /([0-9A-F]+)/gi) {
- my $hexchar = $1;
- if (4 == length($hexchar)) {
- push @chars, $hexchar;
- } else {
- push @chars, above_BMP_char_to_surrogates($hexchar);
- }
- unless ($has_wanted_char) {
- $has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
- }
- }
- if ($has_wanted_char) {
- push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
- }
- }
- print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
- print OUT " new String[] { ";
- print OUT join(", ", @tokens), " });\n\n";
-}
-
-print OUT " }\n}\n";
-close OUT;
-print STDERR "done.\n";
-
-
-# sub above_BMP_char_to_surrogates
-#
-# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
-# to the corresponding UTF-16 surrogate pair
-#
-# Assumption: input string is a sequence more than four hex digits
-#
-sub above_BMP_char_to_surrogates {
- my $ch = hex(shift);
- my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10);
- my $low_surrogate = 0xDC00 + ($ch & 0x3FF);
- return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate);
-}
-
-
-# sub parse_Unicode_data_file
-#
-# Downloads and parses the specified Unicode data file, parses it, and
-# extracts code points assigned any of the given property values, defining
-# the corresponding array position in the passed-in target array.
-#
-# Takes in the following parameters:
-#
-# - URL of the Unicode data file to download and parse
-# - Reference to target array
-# - Reference to hash of property values to get code points for
-#
-sub parse_Unicode_data_file {
- my $url = shift;
- my $target = shift;
- my $wanted_property_values = shift;
- my $content = get_URL_content($url);
- print STDERR "Parsing '$url'...";
- my @lines = split /\r?\n/, $content;
- for (@lines) {
- s/\s*#.*//; # Strip trailing comments
- s/\s+$//; # Strip trailing space
- next unless (/\S/); # Skip empty lines
- my ($start, $end, $property_value);
- if (/^([0-9A-F]{4,5})\s*;\s*(.+)/i) {
- # 00AA ; LATIN
- $start = $end = hex $1;
- $property_value = lc $2; # Property value names are case-insensitive
- } elsif (/^([0-9A-F]{4,5})..([0-9A-F]{4,5})\s*;\s*(.+)/i) {
- # 0AE6..0AEF ; Gujarati
- $start = hex $1;
- $end = hex $2;
- $property_value = lc $3; # Property value names are case-insensitive
- } else {
- next;
- }
- if (defined($wanted_property_values->{$property_value})) {
- for my $code_point ($start..$end) {
- $target->[$code_point] = 1;
- }
- }
- }
- print STDERR "done.\n";
-}
-
-# sub get_URL_content
-#
-# Retrieves and returns the content of the given URL.
-#
-sub get_URL_content {
- my $url = shift;
- print STDERR "Retrieving '$url'...";
- my $user_agent = LWP::UserAgent->new;
- my $request = HTTP::Request->new(GET => $url);
- my $response = $user_agent->request($request);
- unless ($response->is_success) {
- print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
- exit 1;
- }
- print STDERR "done.\n";
- return $response->content;
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
index b9d586e..b7f45cb 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
index ebe1034..fd7aefd 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
index c4b9276..580e269 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
@@ -19,10 +19,10 @@ package org.apache.lucene.analysis.th;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
index 9972702..9cfc6fc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayMap.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayMap.java
deleted file mode 100644
index 66b0dce..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayMap.java
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-import java.util.*;
-
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestCharArrayMap extends LuceneTestCase {
- public void doRandom(int iter, boolean ignoreCase) {
- CharArrayMap<Integer> map = new CharArrayMap<>(1, ignoreCase);
- HashMap<String,Integer> hmap = new HashMap<>();
-
- char[] key;
- for (int i=0; i<iter; i++) {
- int len = random().nextInt(5);
- key = new char[len];
- for (int j=0; j<key.length; j++) {
- key[j] = (char)random().nextInt(127);
- }
- String keyStr = new String(key);
- String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr;
-
- int val = random().nextInt();
-
- Object o1 = map.put(key, val);
- Object o2 = hmap.put(hmapKey,val);
- assertEquals(o1,o2);
-
- // add it again with the string method
- assertEquals(val, map.put(keyStr,val).intValue());
-
- assertEquals(val, map.get(key,0,key.length).intValue());
- assertEquals(val, map.get(key).intValue());
- assertEquals(val, map.get(keyStr).intValue());
-
- assertEquals(hmap.size(), map.size());
- }
- }
-
- public void testCharArrayMap() {
- int num = 5 * RANDOM_MULTIPLIER;
- for (int i = 0; i < num; i++) { // pump this up for more random testing
- doRandom(1000,false);
- doRandom(1000,true);
- }
- }
-
- public void testMethods() {
- CharArrayMap<Integer> cm = new CharArrayMap<>(2, false);
- HashMap<String,Integer> hm = new HashMap<>();
- hm.put("foo",1);
- hm.put("bar",2);
- cm.putAll(hm);
- assertEquals(hm.size(), cm.size());
- hm.put("baz", 3);
- cm.putAll(hm);
- assertEquals(hm.size(), cm.size());
-
- CharArraySet cs = cm.keySet();
- int n=0;
- for (Object o : cs) {
- assertTrue(cm.containsKey(o));
- char[] co = (char[]) o;
- assertTrue(cm.containsKey(co, 0, co.length));
- n++;
- }
- assertEquals(hm.size(), n);
- assertEquals(hm.size(), cs.size());
- assertEquals(cm.size(), cs.size());
- cs.clear();
- assertEquals(0, cs.size());
- assertEquals(0, cm.size());
- // keySet() should not allow adding new keys
- expectThrows(UnsupportedOperationException.class, () -> {
- cs.add("test");
- });
-
- cm.putAll(hm);
- assertEquals(hm.size(), cs.size());
- assertEquals(cm.size(), cs.size());
-
- Iterator<Map.Entry<Object,Integer>> iter1 = cm.entrySet().iterator();
- n=0;
- while (iter1.hasNext()) {
- Map.Entry<Object,Integer> entry = iter1.next();
- Object key = entry.getKey();
- Integer val = entry.getValue();
- assertEquals(cm.get(key), val);
- entry.setValue(val*100);
- assertEquals(val*100, (int)cm.get(key));
- n++;
- }
- assertEquals(hm.size(), n);
- cm.clear();
- cm.putAll(hm);
- assertEquals(cm.size(), n);
-
- CharArrayMap<Integer>.EntryIterator iter2 = cm.entrySet().iterator();
- n=0;
- while (iter2.hasNext()) {
- char[] keyc = iter2.nextKey();
- Integer val = iter2.currentValue();
- assertEquals(hm.get(new String(keyc)), val);
- iter2.setValue(val*100);
- assertEquals(val*100, (int)cm.get(keyc));
- n++;
- }
- assertEquals(hm.size(), n);
-
- cm.entrySet().clear();
- assertEquals(0, cm.size());
- assertEquals(0, cm.entrySet().size());
- assertTrue(cm.isEmpty());
- }
-
- // TODO: break this up into simpler test methods vs. "telling a story"
- public void testModifyOnUnmodifiable(){
- CharArrayMap<Integer> map = new CharArrayMap<>(2, false);
- map.put("foo",1);
- map.put("bar",2);
- final int size = map.size();
- assertEquals(2, size);
- assertTrue(map.containsKey("foo"));
- assertEquals(1, map.get("foo").intValue());
- assertTrue(map.containsKey("bar"));
- assertEquals(2, map.get("bar").intValue());
-
- map = CharArrayMap.unmodifiableMap(map);
- assertEquals("Map size changed due to unmodifiableMap call" , size, map.size());
- String NOT_IN_MAP = "SirGallahad";
- assertFalse("Test String already exists in map", map.containsKey(NOT_IN_MAP));
- assertNull("Test String already exists in map", map.get(NOT_IN_MAP));
-
- try{
- map.put(NOT_IN_MAP.toCharArray(), 3);
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
- assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.put(NOT_IN_MAP, 3);
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
- assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.put(new StringBuilder(NOT_IN_MAP), 3);
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
- assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.clear();
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.entrySet().clear();
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.keySet().clear();
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.put((Object) NOT_IN_MAP, 3);
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
- assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- try{
- map.putAll(Collections.singletonMap(NOT_IN_MAP, 3));
- fail("Modified unmodifiable map");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
- assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
- assertEquals("Size of unmodifiable map has changed", size, map.size());
- }
-
- assertTrue(map.containsKey("foo"));
- assertEquals(1, map.get("foo").intValue());
- assertTrue(map.containsKey("bar"));
- assertEquals(2, map.get("bar").intValue());
- }
-
- public void testToString() {
- CharArrayMap<Integer> cm = new CharArrayMap<>(Collections.singletonMap("test",1), false);
- assertEquals("[test]",cm.keySet().toString());
- assertEquals("[1]",cm.values().toString());
- assertEquals("[test=1]",cm.entrySet().toString());
- assertEquals("{test=1}",cm.toString());
- cm.put("test2", 2);
- assertTrue(cm.keySet().toString().contains(", "));
- assertTrue(cm.values().toString().contains(", "));
- assertTrue(cm.entrySet().toString().contains(", "));
- assertTrue(cm.toString().contains(", "));
- }
-}
-
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java
deleted file mode 100644
index 1fcee65..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.util.*;
-
-import org.apache.lucene.util.LuceneTestCase;
-
-
-public class TestCharArraySet extends LuceneTestCase {
-
- static final String[] TEST_STOP_WORDS = {
- "a", "an", "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it",
- "no", "not", "of", "on", "or", "such",
- "that", "the", "their", "then", "there", "these",
- "they", "this", "to", "was", "will", "with"
- };
-
-
- public void testRehash() throws Exception {
- CharArraySet cas = new CharArraySet(0, true);
- for(int i=0;i<TEST_STOP_WORDS.length;i++)
- cas.add(TEST_STOP_WORDS[i]);
- assertEquals(TEST_STOP_WORDS.length, cas.size());
- for(int i=0;i<TEST_STOP_WORDS.length;i++)
- assertTrue(cas.contains(TEST_STOP_WORDS[i]));
- }
-
- public void testNonZeroOffset() {
- String[] words={"Hello","World","this","is","a","test"};
- char[] findme="xthisy".toCharArray();
- CharArraySet set= new CharArraySet(10, true);
- set.addAll(Arrays.asList(words));
- assertTrue(set.contains(findme, 1, 4));
- assertTrue(set.contains(new String(findme,1,4)));
-
- // test unmodifiable
- set = CharArraySet.unmodifiableSet(set);
- assertTrue(set.contains(findme, 1, 4));
- assertTrue(set.contains(new String(findme,1,4)));
- }
-
- public void testObjectContains() {
- CharArraySet set = new CharArraySet(10, true);
- Integer val = Integer.valueOf(1);
- set.add(val);
- assertTrue(set.contains(val));
- assertTrue(set.contains(new Integer(1))); // another integer
- assertTrue(set.contains("1"));
- assertTrue(set.contains(new char[]{'1'}));
- // test unmodifiable
- set = CharArraySet.unmodifiableSet(set);
- assertTrue(set.contains(val));
- assertTrue(set.contains(new Integer(1))); // another integer
- assertTrue(set.contains("1"));
- assertTrue(set.contains(new char[]{'1'}));
- }
-
- public void testClear(){
- CharArraySet set=new CharArraySet(10,true);
- set.addAll(Arrays.asList(TEST_STOP_WORDS));
- assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
- set.clear();
- assertEquals("not empty", 0, set.size());
- for(int i=0;i<TEST_STOP_WORDS.length;i++)
- assertFalse(set.contains(TEST_STOP_WORDS[i]));
- set.addAll(Arrays.asList(TEST_STOP_WORDS));
- assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
- for(int i=0;i<TEST_STOP_WORDS.length;i++)
- assertTrue(set.contains(TEST_STOP_WORDS[i]));
- }
-
- // TODO: break this up into simpler test methods, vs "telling a story"
- public void testModifyOnUnmodifiable(){
- CharArraySet set=new CharArraySet(10, true);
- set.addAll(Arrays.asList(TEST_STOP_WORDS));
- final int size = set.size();
- set = CharArraySet.unmodifiableSet(set);
- assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
- String NOT_IN_SET = "SirGallahad";
- assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
-
- try{
- set.add(NOT_IN_SET.toCharArray());
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
-
- try{
- set.add(NOT_IN_SET);
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
-
- try{
- set.add(new StringBuilder(NOT_IN_SET));
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
-
- try{
- set.clear();
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
- try{
- set.add((Object) NOT_IN_SET);
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
-
- // This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
- // current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefor never call
- // remove() on the iterator
- try{
- set.removeAll(new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true));
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
-
- try{
- set.retainAll(new CharArraySet(Arrays.asList(NOT_IN_SET), true));
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertEquals("Size of unmodifiable set has changed", size, set.size());
- }
-
- try{
- set.addAll(Arrays.asList(NOT_IN_SET));
- fail("Modified unmodifiable set");
- }catch (UnsupportedOperationException e) {
- // expected
- assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
- }
-
- for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
- assertTrue(set.contains(TEST_STOP_WORDS[i]));
- }
- }
-
- public void testUnmodifiableSet(){
- CharArraySet set = new CharArraySet(10,true);
- set.addAll(Arrays.asList(TEST_STOP_WORDS));
- set.add(Integer.valueOf(1));
- final int size = set.size();
- set = CharArraySet.unmodifiableSet(set);
- assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
- for (String stopword : TEST_STOP_WORDS) {
- assertTrue(set.contains(stopword));
- }
- assertTrue(set.contains(Integer.valueOf(1)));
- assertTrue(set.contains("1"));
- assertTrue(set.contains(new char[]{'1'}));
-
- expectThrows(NullPointerException.class, () -> {
- CharArraySet.unmodifiableSet(null);
- });
- }
-
- public void testSupplementaryChars() {
- String missing = "Term %s is missing in the set";
- String falsePos = "Term %s is in the set but shouldn't";
- // for reference see
- // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
- String[] upperArr = new String[] {"Abc\ud801\udc1c",
- "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
- String[] lowerArr = new String[] {"abc\ud801\udc44",
- "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
- CharArraySet set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true);
- for (String upper : upperArr) {
- set.add(upper);
- }
- for (int i = 0; i < upperArr.length; i++) {
- assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
- assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
- }
- set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), false);
- for (String upper : upperArr) {
- set.add(upper);
- }
- for (int i = 0; i < upperArr.length; i++) {
- assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
- assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
- }
- }
-
- public void testSingleHighSurrogate() {
- String missing = "Term %s is missing in the set";
- String falsePos = "Term %s is in the set but shouldn't";
- String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
- "\uD800EfG", "\uD800\ud801\udc1cB" };
-
- String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
- "\uD800efg", "\uD800\ud801\udc44b" };
- CharArraySet set = new CharArraySet(Arrays
- .asList(TEST_STOP_WORDS), true);
- for (String upper : upperArr) {
- set.add(upper);
- }
- for (int i = 0; i < upperArr.length; i++) {
- assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
- assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
- }
- set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS),
- false);
- for (String upper : upperArr) {
- set.add(upper);
- }
- for (int i = 0; i < upperArr.length; i++) {
- assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
- assertFalse(String.format(Locale.ROOT, falsePos, upperArr[i]), set
- .contains(lowerArr[i]));
- }
- }
-
- @SuppressWarnings("deprecated")
- public void testCopyCharArraySetBWCompat() {
- CharArraySet setIngoreCase = new CharArraySet(10, true);
- CharArraySet setCaseSensitive = new CharArraySet(10, false);
-
- List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
- List<String> stopwordsUpper = new ArrayList<>();
- for (String string : stopwords) {
- stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
- }
- setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
- setIngoreCase.add(Integer.valueOf(1));
- setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
- setCaseSensitive.add(Integer.valueOf(1));
-
- CharArraySet copy = CharArraySet.copy(setIngoreCase);
- CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
-
- assertEquals(setIngoreCase.size(), copy.size());
- assertEquals(setCaseSensitive.size(), copy.size());
-
- assertTrue(copy.containsAll(stopwords));
- assertTrue(copy.containsAll(stopwordsUpper));
- assertTrue(copyCaseSens.containsAll(stopwords));
- for (String string : stopwordsUpper) {
- assertFalse(copyCaseSens.contains(string));
- }
- // test adding terms to the copy
- List<String> newWords = new ArrayList<>();
- for (String string : stopwords) {
- newWords.add(string+"_1");
- }
- copy.addAll(newWords);
-
- assertTrue(copy.containsAll(stopwords));
- assertTrue(copy.containsAll(stopwordsUpper));
- assertTrue(copy.containsAll(newWords));
- // new added terms are not in the source set
- for (String string : newWords) {
- assertFalse(setIngoreCase.contains(string));
- assertFalse(setCaseSensitive.contains(string));
-
- }
- }
-
- /**
- * Test the static #copy() function with a CharArraySet as a source
- */
- public void testCopyCharArraySet() {
- CharArraySet setIngoreCase = new CharArraySet(10, true);
- CharArraySet setCaseSensitive = new CharArraySet(10, false);
-
- List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
- List<String> stopwordsUpper = new ArrayList<>();
- for (String string : stopwords) {
- stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
- }
- setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
- setIngoreCase.add(Integer.valueOf(1));
- setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
- setCaseSensitive.add(Integer.valueOf(1));
-
- CharArraySet copy = CharArraySet.copy(setIngoreCase);
- CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
-
- assertEquals(setIngoreCase.size(), copy.size());
- assertEquals(setCaseSensitive.size(), copy.size());
-
- assertTrue(copy.containsAll(stopwords));
- assertTrue(copy.containsAll(stopwordsUpper));
- assertTrue(copyCaseSens.containsAll(stopwords));
- for (String string : stopwordsUpper) {
- assertFalse(copyCaseSens.contains(string));
- }
- // test adding terms to the copy
- List<String> newWords = new ArrayList<>();
- for (String string : stopwords) {
- newWords.add(string+"_1");
- }
- copy.addAll(newWords);
-
- assertTrue(copy.containsAll(stopwords));
- assertTrue(copy.containsAll(stopwordsUpper));
- assertTrue(copy.containsAll(newWords));
- // new added terms are not in the source set
- for (String string : newWords) {
- assertFalse(setIngoreCase.contains(string));
- assertFalse(setCaseSensitive.contains(string));
-
- }
- }
-
- /**
- * Test the static #copy() function with a JDK {@link Set} as a source
- */
- public void testCopyJDKSet() {
- Set<String> set = new HashSet<>();
-
- List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
- List<String> stopwordsUpper = new ArrayList<>();
- for (String string : stopwords) {
- stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
- }
- set.addAll(Arrays.asList(TEST_STOP_WORDS));
-
- CharArraySet copy = CharArraySet.copy(set);
-
- assertEquals(set.size(), copy.size());
- assertEquals(set.size(), copy.size());
-
- assertTrue(copy.containsAll(stopwords));
- for (String string : stopwordsUpper) {
- assertFalse(copy.contains(string));
- }
-
- List<String> newWords = new ArrayList<>();
- for (String string : stopwords) {
- newWords.add(string+"_1");
- }
- copy.addAll(newWords);
-
- assertTrue(copy.containsAll(stopwords));
- assertTrue(copy.containsAll(newWords));
- // new added terms are not in the source set
- for (String string : newWords) {
- assertFalse(set.contains(string));
- }
- }
-
- /**
- * Tests a special case of {@link CharArraySet#copy(Set)} where the
- * set to copy is the {@link CharArraySet#EMPTY_SET}
- */
- public void testCopyEmptySet() {
- assertSame(CharArraySet.EMPTY_SET,
- CharArraySet.copy(CharArraySet.EMPTY_SET));
- }
-
- /**
- * Smoketests the static empty set
- */
- public void testEmptySet() {
- assertEquals(0, CharArraySet.EMPTY_SET.size());
-
- assertTrue(CharArraySet.EMPTY_SET.isEmpty());
- for (String stopword : TEST_STOP_WORDS) {
- assertFalse(CharArraySet.EMPTY_SET.contains(stopword));
- }
- assertFalse(CharArraySet.EMPTY_SET.contains("foo"));
- assertFalse(CharArraySet.EMPTY_SET.contains((Object) "foo"));
- assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray()));
- assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray(),0,3));
- }
-
- /**
- * Test for NPE
- */
- public void testContainsWithNull() {
- CharArraySet set = new CharArraySet(1, true);
-
- expectThrows(NullPointerException.class, () -> {
- set.contains((char[]) null, 0, 10);
- });
-
- expectThrows(NullPointerException.class, () -> {
- set.contains((CharSequence) null);
- });
-
- expectThrows(NullPointerException.class, () -> {
- set.contains((Object) null);
- });
- }
-
- public void testToString() {
- CharArraySet set = CharArraySet.copy(Collections.singleton("test"));
- assertEquals("[test]", set.toString());
- set.add("test2");
- assertTrue(set.toString().contains(", "));
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
deleted file mode 100644
index 04e96ea..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Arrays;
-
-import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-import org.junit.Test;
-
-/**
- * TestCase for the {@link CharacterUtils} class.
- */
-public class TestCharacterUtils extends LuceneTestCase {
-
- public void testConversions() {
- final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
- final int[] buf = new int[orig.length];
- final char[] restored = new char[buf.length];
- final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
- final int o2 = TestUtil.nextInt(random(), 0, o1);
- final int o3 = TestUtil.nextInt(random(), 0, o1);
- final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
- final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
- assertEquals(orig.length - o1, charCount);
- assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
- }
-
- @Test
- public void testNewCharacterBuffer() {
- CharacterBuffer newCharacterBuffer = CharacterUtils.newCharacterBuffer(1024);
- assertEquals(1024, newCharacterBuffer.getBuffer().length);
- assertEquals(0, newCharacterBuffer.getOffset());
- assertEquals(0, newCharacterBuffer.getLength());
-
- newCharacterBuffer = CharacterUtils.newCharacterBuffer(2);
- assertEquals(2, newCharacterBuffer.getBuffer().length);
- assertEquals(0, newCharacterBuffer.getOffset());
- assertEquals(0, newCharacterBuffer.getLength());
-
- // length must be >= 2
- expectThrows(IllegalArgumentException.class, () -> {
- CharacterUtils.newCharacterBuffer(1);
- });
- }
-
- @Test
- public void testFillNoHighSurrogate() throws IOException {
- Reader reader = new StringReader("helloworld");
- CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
- assertTrue(CharacterUtils.fill(buffer,reader));
- assertEquals(0, buffer.getOffset());
- assertEquals(6, buffer.getLength());
- assertEquals("hellow", new String(buffer.getBuffer()));
- assertFalse(CharacterUtils.fill(buffer,reader));
- assertEquals(4, buffer.getLength());
- assertEquals(0, buffer.getOffset());
-
- assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
- buffer.getLength()));
- assertFalse(CharacterUtils.fill(buffer,reader));
- }
-
- @Test
- public void testFill() throws IOException {
- String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
- Reader reader = new StringReader(input);
- CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
- assertTrue(CharacterUtils.fill(buffer, reader));
- assertEquals(4, buffer.getLength());
- assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
- buffer.getLength()));
- assertTrue(CharacterUtils.fill(buffer, reader));
- assertEquals(5, buffer.getLength());
- assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
- assertTrue(CharacterUtils.fill(buffer, reader));
- assertEquals(4, buffer.getLength());
- assertEquals("123\ud801", new String(buffer.getBuffer(),
- buffer.getOffset(), buffer.getLength()));
- assertFalse(CharacterUtils.fill(buffer, reader));
- assertEquals(3, buffer.getLength());
- assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
- .getOffset(), buffer.getLength()));
- assertFalse(CharacterUtils.fill(buffer, reader));
- assertEquals(0, buffer.getLength());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java
index 5e1d3c1..f8c1198 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java
@@ -24,13 +24,13 @@ import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
*
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
index be90611..eaa6174 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
@@ -24,6 +24,8 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java
deleted file mode 100644
index b1dd1b5..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.util;
-
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.util.LuceneTestCase;
-
-import org.apache.lucene.analysis.util.WordlistLoader;
-
-public class TestWordlistLoader extends LuceneTestCase {
-
- public void testWordlistLoading() throws IOException {
- String s = "ONE\n two \nthree";
- CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
- checkSet(wordSet1);
- CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
- checkSet(wordSet2);
- }
-
- public void testComments() throws Exception {
- String s = "ONE\n two \nthree\n#comment";
- CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
- checkSet(wordSet1);
- assertFalse(wordSet1.contains("#comment"));
- assertFalse(wordSet1.contains("comment"));
- }
-
-
- private void checkSet(CharArraySet wordset) {
- assertEquals(3, wordset.size());
- assertTrue(wordset.contains("ONE")); // case is not modified
- assertTrue(wordset.contains("two")); // surrounding whitespace is removed
- assertTrue(wordset.contains("three"));
- assertFalse(wordset.contains("four"));
- }
-
- /**
- * Test stopwords in snowball format
- */
- public void testSnowballListLoading() throws IOException {
- String s =
- "|comment\n" + // commented line
- " |comment\n" + // commented line with leading whitespace
- "\n" + // blank line
- " \t\n" + // line with only whitespace
- " |comment | comment\n" + // commented line with comment
- "ONE\n" + // stopword, in uppercase
- " two \n" + // stopword with leading/trailing space
- " three four five \n" + // multiple stopwords
- "six seven | comment\n"; //multiple stopwords + comment
- CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s));
- assertEquals(7, wordset.size());
- assertTrue(wordset.contains("ONE"));
- assertTrue(wordset.contains("two"));
- assertTrue(wordset.contains("three"));
- assertTrue(wordset.contains("four"));
- assertTrue(wordset.contains("five"));
- assertTrue(wordset.contains("six"));
- assertTrue(wordset.contains("seven"));
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/icu/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index abb2e2a..bdace97 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -103,7 +103,7 @@ algorithm.
</li>
<li>
Effective Locale-specific normalization (case differences, diacritics, etc.).
- ({@link org.apache.lucene.analysis.core.LowerCaseFilter} and
+ ({@link org.apache.lucene.analysis.LowerCaseFilter} and
{@link org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter} provide these services
in a generic way that doesn't take into account locale-specific needs.)
</li>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
index f2fd50a..17ea967 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
index bff30f1..46d40b1 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -21,15 +21,15 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* Analyzer for Japanese that uses morphological analysis.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
index b8d0a78..a1af95e 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
@@ -44,7 +44,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
* input tokens tokens \uff13 and \uff12\u5343 and give outputs 3 and 2000 instead of 3200, which is
* likely not the intended result. If you want to remove punctuation characters from your
* index that are not part of normalized numbers, add a
- * {@link org.apache.lucene.analysis.core.StopFilter} with the punctuation you wish to
+ * {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to
* remove after {@link JapaneseNumberFilter} in your analyzer chain.
* <p>
* Below are some examples of normalizations this filter supports. The input is untokenized
@@ -615,4 +615,4 @@ public class JapaneseNumberFilter extends TokenFilter {
return position;
}
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java
index 0ee9ccf..342295d 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java
@@ -19,9 +19,9 @@ package org.apache.lucene.analysis.ja;
import java.util.Set;
-import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
/**
* Removes tokens that match a set of part-of-speech tags.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
index 8b5483c..a59de44 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
@@ -22,8 +22,8 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
index b9ebd36..ab6c473 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
@@ -21,11 +21,11 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
index bd14be3..bc57f56 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
@@ -17,16 +17,16 @@
package org.apache.lucene.analysis.ja;
+import java.io.IOException;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-
-import java.io.IOException;
/**
* Tests for {@link JapaneseKatakanaStemFilter}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
index 27cef33..b8a987a 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
@@ -27,11 +27,11 @@ import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.Ignore;
import org.junit.Test;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
index 3429d86..b35523e 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@@ -23,12 +23,6 @@ import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
-import morfologik.stemming.Dictionary;
-import morfologik.stemming.DictionaryLookup;
-import morfologik.stemming.IStemmer;
-import morfologik.stemming.WordData;
-import morfologik.stemming.polish.PolishStemmer;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -36,6 +30,12 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRefBuilder;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import morfologik.stemming.polish.PolishStemmer;
+
/**
* {@link TokenFilter} using Morfologik library to transform input tokens into lemma and
* morphosyntactic (POS) tokens. Applies to Polish only.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
index d8967c7..c4294e3 100644
--- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
@@ -22,13 +22,13 @@ import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/**
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
index bd1fc7b..5f0347b 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
@@ -21,12 +21,12 @@ import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.en.PorterStemFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
index 999ce86..6ed4fda 100644
--- a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
+++ b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
@@ -22,18 +22,18 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.stempel.StempelStemmer;
import org.apache.lucene.analysis.stempel.StempelFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.analysis.stempel.StempelStemmer;
import org.apache.lucene.util.IOUtils;
import org.egothor.stemmer.Trie;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
index b0ef008..c37cedb 100644
--- a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
+++ b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
@@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/common-build.xml
----------------------------------------------------------------------
diff --git a/lucene/common-build.xml b/lucene/common-build.xml
index b4074ac..94b7910 100644
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@@ -2585,4 +2585,34 @@ The following arguments can be provided to ant to alter its behaviour and target
</sequential>
</macrodef>
+ <macrodef name="run-jflex">
+ <attribute name="dir"/>
+ <attribute name="name"/>
+ <sequential>
+ <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+ </sequential>
+ </macrodef>
+
+ <macrodef name="run-jflex-and-disable-buffer-expansion">
+ <attribute name="dir"/>
+ <attribute name="name"/>
+ <sequential>
+ <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+ <!-- LUCENE-5897: Disallow scanner buffer expansion -->
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
+ replace="" flags="s" />
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="private static final int ZZ_BUFFERSIZE ="
+ replace="private int ZZ_BUFFERSIZE ="/>
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="int requested = zzBuffer.length - zzEndRead;"
+ replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="(zzFinalHighSurrogate = 1;)(\r?\n)"
+ replace="\1\2 if (totalRead == 1) { return true; }\2"/>
+ </sequential>
+ </macrodef>
+
+
</project>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/build.xml
----------------------------------------------------------------------
diff --git a/lucene/core/build.xml b/lucene/core/build.xml
index 90da238..4e62e1c 100644
--- a/lucene/core/build.xml
+++ b/lucene/core/build.xml
@@ -133,7 +133,7 @@
<delete file="${build.dir}/moman.zip"/>
</target>
- <target name="regenerate" depends="createLevAutomata,createPackedIntSources"/>
+ <target name="regenerate" depends="createLevAutomata,createPackedIntSources,jflex"/>
<macrodef name="startLockStressTestClient">
<attribute name="clientId"/>
@@ -223,4 +223,20 @@
<target name="test" depends="common.test, test-lock-factory"/>
+ <target name="clean-jflex">
+ <delete>
+ <fileset dir="src/java/org/apache/lucene/analysis/standard" includes="**/*.java">
+ <containsregexp expression="generated.*by.*JFlex"/>
+ </fileset>
+ </delete>
+ </target>
+
+ <target name="jflex" depends="-install-jflex,clean-jflex,-jflex-StandardAnalyzer"/>
+
+ <target name="-jflex-StandardAnalyzer" depends="init,-install-jflex">
+ <run-jflex-and-disable-buffer-expansion
+ dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
+ </target>
+
+
</project>
[07/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java
deleted file mode 100644
index 4a3731e..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/WordBreakTestUnicode_6_3_0.java
+++ /dev/null
@@ -1,5537 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.standard;
-
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
-
-/**
- * This class was automatically generated by generateJavaUnicodeWordBreakTest.pl
- * from: http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakTest.txt
- *
- * WordBreakTest.txt indicates the points in the provided character sequences
- * at which conforming implementations must and must not break words. This
- * class tests for expected token extraction from each of the test sequences
- * in WordBreakTest.txt, where the expected tokens are those character
- * sequences bounded by word breaks and containing at least one character
- * from one of the following character sets:
- *
- * \p{Script = Han} (From http://www.unicode.org/Public/6.3.0/ucd/Scripts.txt)
- * \p{Script = Hiragana}
- * \p{LineBreak = Complex_Context} (From http://www.unicode.org/Public/6.3.0/ucd/LineBreak.txt)
- * \p{WordBreak = ALetter} (From http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt)
- * \p{WordBreak = Hebrew_Letter}
- * \p{WordBreak = Katakana}
- * \p{WordBreak = Numeric} (Excludes full-width Arabic digits)
- * [\uFF10-\uFF19] (Full-width Arabic digits)
- */
-@Ignore
-public class WordBreakTestUnicode_6_3_0 extends BaseTokenStreamTestCase {
-
- public void test(Analyzer analyzer) throws Exception {
- // � 0001 � 0001 � # � [0.2] <START OF HEADING> (Other) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0001",
- new String[] { });
-
- // � 0001 � 0308 � 0001 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0001",
- new String[] { });
-
- // � 0001 � 000D � # � [0.2] <START OF HEADING> (Other) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\r",
- new String[] { });
-
- // � 0001 � 0308 � 000D � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\r",
- new String[] { });
-
- // � 0001 � 000A � # � [0.2] <START OF HEADING> (Other) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\n",
- new String[] { });
-
- // � 0001 � 0308 � 000A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\n",
- new String[] { });
-
- // � 0001 � 000B � # � [0.2] <START OF HEADING> (Other) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u000B",
- new String[] { });
-
- // � 0001 � 0308 � 000B � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u000B",
- new String[] { });
-
- // � 0001 � 3031 � # � [0.2] <START OF HEADING> (Other) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u3031",
- new String[] { "\u3031" });
-
- // � 0001 � 0308 � 3031 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u3031",
- new String[] { "\u3031" });
-
- // � 0001 � 0041 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0041",
- new String[] { "\u0041" });
-
- // � 0001 � 0308 � 0041 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0041",
- new String[] { "\u0041" });
-
- // � 0001 � 003A � # � [0.2] <START OF HEADING> (Other) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u003A",
- new String[] { });
-
- // � 0001 � 0308 � 003A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u003A",
- new String[] { });
-
- // � 0001 � 002C � # � [0.2] <START OF HEADING> (Other) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u002C",
- new String[] { });
-
- // � 0001 � 0308 � 002C � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u002C",
- new String[] { });
-
- // � 0001 � 002E � # � [0.2] <START OF HEADING> (Other) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u002E",
- new String[] { });
-
- // � 0001 � 0308 � 002E � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u002E",
- new String[] { });
-
- // � 0001 � 0030 � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0030",
- new String[] { "\u0030" });
-
- // � 0001 � 0308 � 0030 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0030",
- new String[] { "\u0030" });
-
- // � 0001 � 005F � # � [0.2] <START OF HEADING> (Other) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u005F",
- new String[] { });
-
- // � 0001 � 0308 � 005F � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u005F",
- new String[] { });
-
- // � 0001 � 1F1E6 � # � [0.2] <START OF HEADING> (Other) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\uD83C\uDDE6",
- new String[] { });
-
- // � 0001 � 0308 � 1F1E6 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\uD83C\uDDE6",
- new String[] { });
-
- // � 0001 � 05D0 � # � [0.2] <START OF HEADING> (Other) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u05D0",
- new String[] { "\u05D0" });
-
- // � 0001 � 0308 � 05D0 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u05D0",
- new String[] { "\u05D0" });
-
- // � 0001 � 0022 � # � [0.2] <START OF HEADING> (Other) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\"",
- new String[] { });
-
- // � 0001 � 0308 � 0022 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\"",
- new String[] { });
-
- // � 0001 � 0027 � # � [0.2] <START OF HEADING> (Other) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0027",
- new String[] { });
-
- // � 0001 � 0308 � 0027 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0027",
- new String[] { });
-
- // � 0001 � 00AD � # � [0.2] <START OF HEADING> (Other) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u00AD",
- new String[] { });
-
- // � 0001 � 0308 � 00AD � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u00AD",
- new String[] { });
-
- // � 0001 � 0300 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0300",
- new String[] { });
-
- // � 0001 � 0308 � 0300 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0300",
- new String[] { });
-
- // � 0001 � 0061 � 2060 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 0001 � 0308 � 0061 � 2060 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 0001 � 0061 � 003A � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 0001 � 0308 � 0061 � 003A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 0001 � 0061 � 0027 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 0001 � 0308 � 0061 � 0027 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 0001 � 0061 � 0027 � 2060 � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 0001 � 0308 � 0061 � 0027 � 2060 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 0001 � 0061 � 002C � # � [0.2] <START OF HEADING> (Other) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 0001 � 0308 � 0061 � 002C � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 0001 � 0031 � 003A � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 0001 � 0308 � 0031 � 003A � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 0001 � 0031 � 0027 � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 0001 � 0308 � 0031 � 0027 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 0001 � 0031 � 002C � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 0001 � 0308 � 0031 � 002C � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 0001 � 0031 � 002E � 2060 � # � [0.2] <START OF HEADING> (Other) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 0001 � 0308 � 0031 � 002E � 2060 � # � [0.2] <START OF HEADING> (Other) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0001\u0308\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 000D � 0001 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0001",
- new String[] { });
-
- // � 000D � 0308 � 0001 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0001",
- new String[] { });
-
- // � 000D � 000D � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\r\r",
- new String[] { });
-
- // � 000D � 0308 � 000D � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\r",
- new String[] { });
-
- // � 000D � 000A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.0] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\r\n",
- new String[] { });
-
- // � 000D � 0308 � 000A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\n",
- new String[] { });
-
- // � 000D � 000B � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u000B",
- new String[] { });
-
- // � 000D � 0308 � 000B � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u000B",
- new String[] { });
-
- // � 000D � 3031 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u3031",
- new String[] { "\u3031" });
-
- // � 000D � 0308 � 3031 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u3031",
- new String[] { "\u3031" });
-
- // � 000D � 0041 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0041",
- new String[] { "\u0041" });
-
- // � 000D � 0308 � 0041 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0041",
- new String[] { "\u0041" });
-
- // � 000D � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u003A",
- new String[] { });
-
- // � 000D � 0308 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u003A",
- new String[] { });
-
- // � 000D � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u002C",
- new String[] { });
-
- // � 000D � 0308 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u002C",
- new String[] { });
-
- // � 000D � 002E � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u002E",
- new String[] { });
-
- // � 000D � 0308 � 002E � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u002E",
- new String[] { });
-
- // � 000D � 0030 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0030",
- new String[] { "\u0030" });
-
- // � 000D � 0308 � 0030 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0030",
- new String[] { "\u0030" });
-
- // � 000D � 005F � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u005F",
- new String[] { });
-
- // � 000D � 0308 � 005F � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u005F",
- new String[] { });
-
- // � 000D � 1F1E6 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\r\uD83C\uDDE6",
- new String[] { });
-
- // � 000D � 0308 � 1F1E6 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\uD83C\uDDE6",
- new String[] { });
-
- // � 000D � 05D0 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u05D0",
- new String[] { "\u05D0" });
-
- // � 000D � 0308 � 05D0 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u05D0",
- new String[] { "\u05D0" });
-
- // � 000D � 0022 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\"",
- new String[] { });
-
- // � 000D � 0308 � 0022 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\"",
- new String[] { });
-
- // � 000D � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0027",
- new String[] { });
-
- // � 000D � 0308 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0027",
- new String[] { });
-
- // � 000D � 00AD � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u00AD",
- new String[] { });
-
- // � 000D � 0308 � 00AD � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u00AD",
- new String[] { });
-
- // � 000D � 0300 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0300",
- new String[] { });
-
- // � 000D � 0308 � 0300 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0300",
- new String[] { });
-
- // � 000D � 0061 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 000D � 0308 � 0061 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 000D � 0061 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 000D � 0308 � 0061 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 000D � 0061 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 000D � 0308 � 0061 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 000D � 0061 � 0027 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 000D � 0308 � 0061 � 0027 � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 000D � 0061 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 000D � 0308 � 0061 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 000D � 0031 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 000D � 0308 � 0031 � 003A � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 000D � 0031 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 000D � 0308 � 0031 � 0027 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 000D � 0031 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 000D � 0308 � 0031 � 002C � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 000D � 0031 � 002E � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 000D � 0308 � 0031 � 002E � 2060 � # � [0.2] <CARRIAGE RETURN (CR)> (CR) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\r\u0308\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 000A � 0001 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0001",
- new String[] { });
-
- // � 000A � 0308 � 0001 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0001",
- new String[] { });
-
- // � 000A � 000D � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\n\r",
- new String[] { });
-
- // � 000A � 0308 � 000D � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\r",
- new String[] { });
-
- // � 000A � 000A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\n\n",
- new String[] { });
-
- // � 000A � 0308 � 000A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\n",
- new String[] { });
-
- // � 000A � 000B � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u000B",
- new String[] { });
-
- // � 000A � 0308 � 000B � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u000B",
- new String[] { });
-
- // � 000A � 3031 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u3031",
- new String[] { "\u3031" });
-
- // � 000A � 0308 � 3031 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u3031",
- new String[] { "\u3031" });
-
- // � 000A � 0041 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0041",
- new String[] { "\u0041" });
-
- // � 000A � 0308 � 0041 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0041",
- new String[] { "\u0041" });
-
- // � 000A � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u003A",
- new String[] { });
-
- // � 000A � 0308 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u003A",
- new String[] { });
-
- // � 000A � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u002C",
- new String[] { });
-
- // � 000A � 0308 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u002C",
- new String[] { });
-
- // � 000A � 002E � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u002E",
- new String[] { });
-
- // � 000A � 0308 � 002E � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u002E",
- new String[] { });
-
- // � 000A � 0030 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0030",
- new String[] { "\u0030" });
-
- // � 000A � 0308 � 0030 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0030",
- new String[] { "\u0030" });
-
- // � 000A � 005F � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u005F",
- new String[] { });
-
- // � 000A � 0308 � 005F � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u005F",
- new String[] { });
-
- // � 000A � 1F1E6 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\n\uD83C\uDDE6",
- new String[] { });
-
- // � 000A � 0308 � 1F1E6 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\uD83C\uDDE6",
- new String[] { });
-
- // � 000A � 05D0 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u05D0",
- new String[] { "\u05D0" });
-
- // � 000A � 0308 � 05D0 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u05D0",
- new String[] { "\u05D0" });
-
- // � 000A � 0022 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\"",
- new String[] { });
-
- // � 000A � 0308 � 0022 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\"",
- new String[] { });
-
- // � 000A � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0027",
- new String[] { });
-
- // � 000A � 0308 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0027",
- new String[] { });
-
- // � 000A � 00AD � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u00AD",
- new String[] { });
-
- // � 000A � 0308 � 00AD � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u00AD",
- new String[] { });
-
- // � 000A � 0300 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0300",
- new String[] { });
-
- // � 000A � 0308 � 0300 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0300",
- new String[] { });
-
- // � 000A � 0061 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 000A � 0308 � 0061 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 000A � 0061 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 000A � 0308 � 0061 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 000A � 0061 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 000A � 0308 � 0061 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 000A � 0061 � 0027 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 000A � 0308 � 0061 � 0027 � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 000A � 0061 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 000A � 0308 � 0061 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 000A � 0031 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 000A � 0308 � 0031 � 003A � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 000A � 0031 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 000A � 0308 � 0031 � 0027 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 000A � 0031 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 000A � 0308 � 0031 � 002C � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 000A � 0031 � 002E � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 000A � 0308 � 0031 � 002E � 2060 � # � [0.2] <LINE FEED (LF)> (LF) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\n\u0308\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 000B � 0001 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0001",
- new String[] { });
-
- // � 000B � 0308 � 0001 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0001",
- new String[] { });
-
- // � 000B � 000D � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\r",
- new String[] { });
-
- // � 000B � 0308 � 000D � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\r",
- new String[] { });
-
- // � 000B � 000A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\n",
- new String[] { });
-
- // � 000B � 0308 � 000A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\n",
- new String[] { });
-
- // � 000B � 000B � # � [0.2] <LINE TABULATION> (Newline) � [3.1] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u000B",
- new String[] { });
-
- // � 000B � 0308 � 000B � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u000B",
- new String[] { });
-
- // � 000B � 3031 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u3031",
- new String[] { "\u3031" });
-
- // � 000B � 0308 � 3031 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u3031",
- new String[] { "\u3031" });
-
- // � 000B � 0041 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0041",
- new String[] { "\u0041" });
-
- // � 000B � 0308 � 0041 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0041",
- new String[] { "\u0041" });
-
- // � 000B � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u003A",
- new String[] { });
-
- // � 000B � 0308 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u003A",
- new String[] { });
-
- // � 000B � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u002C",
- new String[] { });
-
- // � 000B � 0308 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u002C",
- new String[] { });
-
- // � 000B � 002E � # � [0.2] <LINE TABULATION> (Newline) � [3.1] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u002E",
- new String[] { });
-
- // � 000B � 0308 � 002E � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u002E",
- new String[] { });
-
- // � 000B � 0030 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0030",
- new String[] { "\u0030" });
-
- // � 000B � 0308 � 0030 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0030",
- new String[] { "\u0030" });
-
- // � 000B � 005F � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u005F",
- new String[] { });
-
- // � 000B � 0308 � 005F � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u005F",
- new String[] { });
-
- // � 000B � 1F1E6 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\uD83C\uDDE6",
- new String[] { });
-
- // � 000B � 0308 � 1F1E6 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\uD83C\uDDE6",
- new String[] { });
-
- // � 000B � 05D0 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u05D0",
- new String[] { "\u05D0" });
-
- // � 000B � 0308 � 05D0 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u05D0",
- new String[] { "\u05D0" });
-
- // � 000B � 0022 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\"",
- new String[] { });
-
- // � 000B � 0308 � 0022 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\"",
- new String[] { });
-
- // � 000B � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0027",
- new String[] { });
-
- // � 000B � 0308 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0027",
- new String[] { });
-
- // � 000B � 00AD � # � [0.2] <LINE TABULATION> (Newline) � [3.1] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u00AD",
- new String[] { });
-
- // � 000B � 0308 � 00AD � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u00AD",
- new String[] { });
-
- // � 000B � 0300 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0300",
- new String[] { });
-
- // � 000B � 0308 � 0300 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0300",
- new String[] { });
-
- // � 000B � 0061 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 000B � 0308 � 0061 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u2060",
- new String[] { "\u0061\u2060" });
-
- // � 000B � 0061 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 000B � 0308 � 0061 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u003A",
- new String[] { "\u0061" });
-
- // � 000B � 0061 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 000B � 0308 � 0061 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u0027",
- new String[] { "\u0061" });
-
- // � 000B � 0061 � 0027 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 000B � 0308 � 0061 � 0027 � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u0027\u2060",
- new String[] { "\u0061" });
-
- // � 000B � 0061 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 000B � 0308 � 0061 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0061\u002C",
- new String[] { "\u0061" });
-
- // � 000B � 0031 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 000B � 0308 � 0031 � 003A � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u003A",
- new String[] { "\u0031" });
-
- // � 000B � 0031 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 000B � 0308 � 0031 � 0027 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u0027",
- new String[] { "\u0031" });
-
- // � 000B � 0031 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 000B � 0308 � 0031 � 002C � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u002C",
- new String[] { "\u0031" });
-
- // � 000B � 0031 � 002E � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 000B � 0308 � 0031 � 002E � 2060 � # � [0.2] <LINE TABULATION> (Newline) � [3.1] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u000B\u0308\u0031\u002E\u2060",
- new String[] { "\u0031" });
-
- // � 3031 � 0001 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0001",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 0001 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0001",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 000D � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\r",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 000D � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\r",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 000A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\n",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 000A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\n",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 000B � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u000B",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 000B � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u000B",
- new String[] { "\u3031\u0308" });
-
- // ÷ 3031 × 3031 ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [13.0] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u3031",
- new String[] { "\u3031\u3031" });
-
- // � 3031 � 0308 � 3031 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [13.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u3031",
- new String[] { "\u3031\u0308\u3031" });
-
- // � 3031 � 0041 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0041",
- new String[] { "\u3031", "\u0041" });
-
- // � 3031 � 0308 � 0041 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0041",
- new String[] { "\u3031\u0308", "\u0041" });
-
- // � 3031 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u003A",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u003A",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u002C",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u002C",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 002E � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u002E",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 002E � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u002E",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 0030 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0030",
- new String[] { "\u3031", "\u0030" });
-
- // � 3031 � 0308 � 0030 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0030",
- new String[] { "\u3031\u0308", "\u0030" });
-
- // ÷ 3031 × 005F ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [13.1] LOW LINE (ExtendNumLet) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u005F",
- new String[] { "\u3031\u005F" });
-
- // � 3031 � 0308 � 005F � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [13.1] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u005F",
- new String[] { "\u3031\u0308\u005F" });
-
- // � 3031 � 1F1E6 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\uD83C\uDDE6",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 1F1E6 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\uD83C\uDDE6",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 05D0 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u05D0",
- new String[] { "\u3031", "\u05D0" });
-
- // � 3031 � 0308 � 05D0 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u05D0",
- new String[] { "\u3031\u0308", "\u05D0" });
-
- // � 3031 � 0022 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\"",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 0022 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\"",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0027",
- new String[] { "\u3031" });
-
- // � 3031 � 0308 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0027",
- new String[] { "\u3031\u0308" });
-
- // � 3031 � 00AD � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u00AD",
- new String[] { "\u3031\u00AD" });
-
- // � 3031 � 0308 � 00AD � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u00AD",
- new String[] { "\u3031\u0308\u00AD" });
-
- // � 3031 � 0300 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0300",
- new String[] { "\u3031\u0300" });
-
- // � 3031 � 0308 � 0300 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0300",
- new String[] { "\u3031\u0308\u0300" });
-
- // � 3031 � 0061 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0061\u2060",
- new String[] { "\u3031", "\u0061\u2060" });
-
- // � 3031 � 0308 � 0061 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u2060",
- new String[] { "\u3031\u0308", "\u0061\u2060" });
-
- // � 3031 � 0061 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0061\u003A",
- new String[] { "\u3031", "\u0061" });
-
- // � 3031 � 0308 � 0061 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u003A",
- new String[] { "\u3031\u0308", "\u0061" });
-
- // � 3031 � 0061 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0061\u0027",
- new String[] { "\u3031", "\u0061" });
-
- // � 3031 � 0308 � 0061 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u0027",
- new String[] { "\u3031\u0308", "\u0061" });
-
- // � 3031 � 0061 � 0027 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0061\u0027\u2060",
- new String[] { "\u3031", "\u0061" });
-
- // � 3031 � 0308 � 0061 � 0027 � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u0027\u2060",
- new String[] { "\u3031\u0308", "\u0061" });
-
- // � 3031 � 0061 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0061\u002C",
- new String[] { "\u3031", "\u0061" });
-
- // � 3031 � 0308 � 0061 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0061\u002C",
- new String[] { "\u3031\u0308", "\u0061" });
-
- // � 3031 � 0031 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0031\u003A",
- new String[] { "\u3031", "\u0031" });
-
- // � 3031 � 0308 � 0031 � 003A � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u003A",
- new String[] { "\u3031\u0308", "\u0031" });
-
- // � 3031 � 0031 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0031\u0027",
- new String[] { "\u3031", "\u0031" });
-
- // � 3031 � 0308 � 0031 � 0027 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u0027",
- new String[] { "\u3031\u0308", "\u0031" });
-
- // � 3031 � 0031 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0031\u002C",
- new String[] { "\u3031", "\u0031" });
-
- // � 3031 � 0308 � 0031 � 002C � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u002C",
- new String[] { "\u3031\u0308", "\u0031" });
-
- // � 3031 � 0031 � 002E � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0031\u002E\u2060",
- new String[] { "\u3031", "\u0031" });
-
- // � 3031 � 0308 � 0031 � 002E � 2060 � # � [0.2] VERTICAL KANA REPEAT MARK (Katakana) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u3031\u0308\u0031\u002E\u2060",
- new String[] { "\u3031\u0308", "\u0031" });
-
- // � 0041 � 0001 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0001",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 0001 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] <START OF HEADING> (Other) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0001",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 000D � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\r",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 000D � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\r",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 000A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\n",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 000A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\n",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 000B � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u000B",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 000B � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u000B",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 3031 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u3031",
- new String[] { "\u0041", "\u3031" });
-
- // � 0041 � 0308 � 3031 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u3031",
- new String[] { "\u0041\u0308", "\u3031" });
-
- // ÷ 0041 × 0041 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [5.0] LATIN CAPITAL LETTER A (ALetter) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0041",
- new String[] { "\u0041\u0041" });
-
- // � 0041 � 0308 � 0041 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0041",
- new String[] { "\u0041\u0308\u0041" });
-
- // � 0041 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u003A",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u003A",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u002C",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u002C",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 002E � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u002E",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 002E � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u002E",
- new String[] { "\u0041\u0308" });
-
- // ÷ 0041 × 0030 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [9.0] DIGIT ZERO (Numeric) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0030",
- new String[] { "\u0041\u0030" });
-
- // � 0041 � 0308 � 0030 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0030",
- new String[] { "\u0041\u0308\u0030" });
-
- // � 0041 � 005F � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [13.1] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u005F",
- new String[] { "\u0041\u005F" });
-
- // � 0041 � 0308 � 005F � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [13.1] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u005F",
- new String[] { "\u0041\u0308\u005F" });
-
- // � 0041 � 1F1E6 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\uD83C\uDDE6",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 1F1E6 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\uD83C\uDDE6",
- new String[] { "\u0041\u0308" });
-
- // ÷ 0041 × 05D0 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [5.0] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u05D0",
- new String[] { "\u0041\u05D0" });
-
- // � 0041 � 0308 � 05D0 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u05D0",
- new String[] { "\u0041\u0308\u05D0" });
-
- // � 0041 � 0022 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\"",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 0022 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\"",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0027",
- new String[] { "\u0041" });
-
- // � 0041 � 0308 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0027",
- new String[] { "\u0041\u0308" });
-
- // � 0041 � 00AD � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u00AD",
- new String[] { "\u0041\u00AD" });
-
- // � 0041 � 0308 � 00AD � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u00AD",
- new String[] { "\u0041\u0308\u00AD" });
-
- // � 0041 � 0300 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0300",
- new String[] { "\u0041\u0300" });
-
- // � 0041 � 0308 � 0300 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0300",
- new String[] { "\u0041\u0308\u0300" });
-
- // ÷ 0041 × 0061 × 2060 ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [5.0] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0061\u2060",
- new String[] { "\u0041\u0061\u2060" });
-
- // � 0041 � 0308 � 0061 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u2060",
- new String[] { "\u0041\u0308\u0061\u2060" });
-
- // � 0041 � 0061 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0061\u003A",
- new String[] { "\u0041\u0061" });
-
- // � 0041 � 0308 � 0061 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u003A",
- new String[] { "\u0041\u0308\u0061" });
-
- // � 0041 � 0061 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0061\u0027",
- new String[] { "\u0041\u0061" });
-
- // � 0041 � 0308 � 0061 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u0027",
- new String[] { "\u0041\u0308\u0061" });
-
- // � 0041 � 0061 � 0027 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0061\u0027\u2060",
- new String[] { "\u0041\u0061" });
-
- // � 0041 � 0308 � 0061 � 0027 � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] APOSTROPHE (Single_Quote) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u0027\u2060",
- new String[] { "\u0041\u0308\u0061" });
-
- // � 0041 � 0061 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0061\u002C",
- new String[] { "\u0041\u0061" });
-
- // � 0041 � 0308 � 0061 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [5.0] LATIN SMALL LETTER A (ALetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0061\u002C",
- new String[] { "\u0041\u0308\u0061" });
-
- // � 0041 � 0031 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0031\u003A",
- new String[] { "\u0041\u0031" });
-
- // � 0041 � 0308 � 0031 � 003A � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u003A",
- new String[] { "\u0041\u0308\u0031" });
-
- // � 0041 � 0031 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0031\u0027",
- new String[] { "\u0041\u0031" });
-
- // � 0041 � 0308 � 0031 � 0027 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u0027",
- new String[] { "\u0041\u0308\u0031" });
-
- // � 0041 � 0031 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0031\u002C",
- new String[] { "\u0041\u0031" });
-
- // � 0041 � 0308 � 0031 � 002C � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u002C",
- new String[] { "\u0041\u0308\u0031" });
-
- // � 0041 � 0031 � 002E � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [9.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0031\u002E\u2060",
- new String[] { "\u0041\u0031" });
-
- // � 0041 � 0308 � 0031 � 002E � 2060 � # � [0.2] LATIN CAPITAL LETTER A (ALetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [9.0] DIGIT ONE (Numeric) � [999.0] FULL STOP (MidNumLet) � [4.0] WORD JOINER (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u0041\u0308\u0031\u002E\u2060",
- new String[] { "\u0041\u0308\u0031" });
-
- // ÷ 003A ÷ 0001 ÷ # ÷ [0.2] COLON (MidLetter) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0001",
- new String[] { });
-
- // ÷ 003A × 0308 ÷ 0001 ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u0001",
- new String[] { });
-
- // � 003A � 000D � # � [0.2] COLON (MidLetter) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\r",
- new String[] { });
-
- // � 003A � 0308 � 000D � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <CARRIAGE RETURN (CR)> (CR) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\r",
- new String[] { });
-
- // � 003A � 000A � # � [0.2] COLON (MidLetter) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\n",
- new String[] { });
-
- // � 003A � 0308 � 000A � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE FEED (LF)> (LF) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\n",
- new String[] { });
-
- // � 003A � 000B � # � [0.2] COLON (MidLetter) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u000B",
- new String[] { });
-
- // � 003A � 0308 � 000B � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [3.2] <LINE TABULATION> (Newline) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u000B",
- new String[] { });
-
- // � 003A � 3031 � # � [0.2] COLON (MidLetter) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u3031",
- new String[] { "\u3031" });
-
- // � 003A � 0308 � 3031 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] VERTICAL KANA REPEAT MARK (Katakana) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u3031",
- new String[] { "\u3031" });
-
- // � 003A � 0041 � # � [0.2] COLON (MidLetter) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0041",
- new String[] { "\u0041" });
-
- // � 003A � 0308 � 0041 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LATIN CAPITAL LETTER A (ALetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u0041",
- new String[] { "\u0041" });
-
- // � 003A � 003A � # � [0.2] COLON (MidLetter) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u003A",
- new String[] { });
-
- // � 003A � 0308 � 003A � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COLON (MidLetter) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u003A",
- new String[] { });
-
- // � 003A � 002C � # � [0.2] COLON (MidLetter) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u002C",
- new String[] { });
-
- // � 003A � 0308 � 002C � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] COMMA (MidNum) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u002C",
- new String[] { });
-
- // � 003A � 002E � # � [0.2] COLON (MidLetter) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u002E",
- new String[] { });
-
- // � 003A � 0308 � 002E � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] FULL STOP (MidNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u002E",
- new String[] { });
-
- // � 003A � 0030 � # � [0.2] COLON (MidLetter) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0030",
- new String[] { "\u0030" });
-
- // � 003A � 0308 � 0030 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] DIGIT ZERO (Numeric) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u0030",
- new String[] { "\u0030" });
-
- // � 003A � 005F � # � [0.2] COLON (MidLetter) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u005F",
- new String[] { });
-
- // � 003A � 0308 � 005F � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] LOW LINE (ExtendNumLet) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u005F",
- new String[] { });
-
- // � 003A � 1F1E6 � # � [0.2] COLON (MidLetter) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\uD83C\uDDE6",
- new String[] { });
-
- // � 003A � 0308 � 1F1E6 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\uD83C\uDDE6",
- new String[] { });
-
- // � 003A � 05D0 � # � [0.2] COLON (MidLetter) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u05D0",
- new String[] { "\u05D0" });
-
- // � 003A � 0308 � 05D0 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] HEBREW LETTER ALEF (Hebrew_Letter) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u05D0",
- new String[] { "\u05D0" });
-
- // � 003A � 0022 � # � [0.2] COLON (MidLetter) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\"",
- new String[] { });
-
- // � 003A � 0308 � 0022 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] QUOTATION MARK (Double_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\"",
- new String[] { });
-
- // � 003A � 0027 � # � [0.2] COLON (MidLetter) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0027",
- new String[] { });
-
- // � 003A � 0308 � 0027 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [999.0] APOSTROPHE (Single_Quote) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u0027",
- new String[] { });
-
- // � 003A � 00AD � # � [0.2] COLON (MidLetter) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u00AD",
- new String[] { });
-
- // � 003A � 0308 � 00AD � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] SOFT HYPHEN (Format_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u00AD",
- new String[] { });
-
- // � 003A � 0300 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0300",
- new String[] { });
-
- // � 003A � 0308 � 0300 � # � [0.2] COLON (MidLetter) � [4.0] COMBINING DIAERESIS (Extend_FE) � [4.0] COMBINING GRAVE ACCENT (Extend_FE) � [0.3]
- assertAnalyzesTo(analyzer, "\u003A\u0308\u0300",
- new
<TRUNCATED>
[12/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
Posted by mi...@apache.org.
LUCENE-7318: graduate StandardAnalyzer and make it the default for IndexWriterConfig
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ba922148
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ba922148
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ba922148
Branch: refs/heads/branch_6x
Commit: ba922148307248893bf70d02b28efdec9882f348
Parents: 45d2d2e
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Jun 14 16:38:04 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Jun 14 18:41:37 2016 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
lucene/analysis/common/build.xml | 39 +-
.../lucene/analysis/ar/ArabicAnalyzer.java | 12 +-
.../lucene/analysis/bg/BulgarianAnalyzer.java | 10 +-
.../lucene/analysis/br/BrazilianAnalyzer.java | 10 +-
.../lucene/analysis/ca/CatalanAnalyzer.java | 10 +-
.../charfilter/HTMLStripCharFilter.java | 4 +-
.../apache/lucene/analysis/cjk/CJKAnalyzer.java | 8 +-
.../lucene/analysis/ckb/SoraniAnalyzer.java | 14 +-
.../analysis/commongrams/CommonGramsFilter.java | 4 +-
.../commongrams/CommonGramsFilterFactory.java | 3 +-
.../compound/CompoundWordTokenFilterBase.java | 8 +-
.../DictionaryCompoundWordTokenFilter.java | 2 +-
...ictionaryCompoundWordTokenFilterFactory.java | 8 +-
.../HyphenationCompoundWordTokenFilter.java | 7 +-
...phenationCompoundWordTokenFilterFactory.java | 11 +-
.../lucene/analysis/core/LowerCaseFilter.java | 50 -
.../analysis/core/LowerCaseFilterFactory.java | 2 +-
.../lucene/analysis/core/SimpleAnalyzer.java | 1 +
.../lucene/analysis/core/StopAnalyzer.java | 25 +-
.../apache/lucene/analysis/core/StopFilter.java | 111 -
.../lucene/analysis/core/StopFilterFactory.java | 11 +-
.../lucene/analysis/core/TypeTokenFilter.java | 2 +-
.../lucene/analysis/core/UpperCaseFilter.java | 3 +-
.../lucene/analysis/cz/CzechAnalyzer.java | 18 +-
.../lucene/analysis/da/DanishAnalyzer.java | 12 +-
.../lucene/analysis/de/GermanAnalyzer.java | 12 +-
.../lucene/analysis/el/GreekAnalyzer.java | 6 +-
.../apache/lucene/analysis/el/GreekStemmer.java | 2 +-
.../lucene/analysis/en/EnglishAnalyzer.java | 8 +-
.../org/apache/lucene/analysis/en/KStemmer.java | 2 +-
.../lucene/analysis/es/SpanishAnalyzer.java | 12 +-
.../lucene/analysis/eu/BasqueAnalyzer.java | 10 +-
.../lucene/analysis/fa/PersianAnalyzer.java | 8 +-
.../lucene/analysis/fi/FinnishAnalyzer.java | 12 +-
.../lucene/analysis/fr/FrenchAnalyzer.java | 24 +-
.../lucene/analysis/ga/IrishAnalyzer.java | 8 +-
.../lucene/analysis/gl/GalicianAnalyzer.java | 12 +-
.../lucene/analysis/hi/HindiAnalyzer.java | 12 +-
.../lucene/analysis/hu/HungarianAnalyzer.java | 12 +-
.../lucene/analysis/hunspell/Stemmer.java | 2 +-
.../lucene/analysis/hy/ArmenianAnalyzer.java | 10 +-
.../lucene/analysis/id/IndonesianAnalyzer.java | 10 +-
.../lucene/analysis/it/ItalianAnalyzer.java | 12 +-
.../lucene/analysis/lt/LithuanianAnalyzer.java | 10 +-
.../lucene/analysis/lv/LatvianAnalyzer.java | 12 +-
.../miscellaneous/CapitalizationFilter.java | 2 +-
.../CapitalizationFilterFactory.java | 8 +-
.../miscellaneous/CodepointCountFilter.java | 2 +-
.../miscellaneous/DateRecognizerFilter.java | 2 +-
.../miscellaneous/FingerprintFilter.java | 2 +-
.../analysis/miscellaneous/KeepWordFilter.java | 4 +-
.../miscellaneous/KeepWordFilterFactory.java | 8 +-
.../KeywordMarkerFilterFactory.java | 2 +-
.../analysis/miscellaneous/LengthFilter.java | 2 +-
.../RemoveDuplicatesTokenFilter.java | 2 +-
.../miscellaneous/SetKeywordMarkerFilter.java | 2 +-
.../miscellaneous/WordDelimiterFilter.java | 10 +-
.../WordDelimiterFilterFactory.java | 16 +-
.../lucene/analysis/ngram/NGramTokenizer.java | 2 +-
.../lucene/analysis/nl/DutchAnalyzer.java | 22 +-
.../lucene/analysis/no/NorwegianAnalyzer.java | 12 +-
.../lucene/analysis/pt/PortugueseAnalyzer.java | 12 +-
.../lucene/analysis/pt/RSLPStemmerBase.java | 2 +-
.../query/QueryAutoStopWordAnalyzer.java | 4 +-
.../lucene/analysis/ro/RomanianAnalyzer.java | 10 +-
.../lucene/analysis/ru/RussianAnalyzer.java | 16 +-
.../analysis/snowball/SnowballFilter.java | 4 +-
.../snowball/SnowballPorterFilterFactory.java | 6 +-
.../analysis/standard/ClassicAnalyzer.java | 10 +-
.../analysis/standard/StandardAnalyzer.java | 98 -
.../analysis/standard/StandardFilter.java | 38 -
.../analysis/standard/StandardTokenizer.java | 201 -
.../standard/StandardTokenizerImpl.java | 818 ---
.../standard/StandardTokenizerImpl.jflex | 201 -
.../standard/UAX29URLEmailAnalyzer.java | 14 +-
.../lucene/analysis/standard/package-info.java | 63 -
.../lucene/analysis/standard/package.html | 50 +
.../lucene/analysis/sv/SwedishAnalyzer.java | 12 +-
.../analysis/synonym/SynonymFilterFactory.java | 2 +-
.../apache/lucene/analysis/th/ThaiAnalyzer.java | 8 +-
.../lucene/analysis/tr/TurkishAnalyzer.java | 8 +-
.../analysis/util/AbstractAnalysisFactory.java | 4 +-
.../lucene/analysis/util/CharArrayMap.java | 669 ---
.../lucene/analysis/util/CharArraySet.java | 193 -
.../lucene/analysis/util/CharTokenizer.java | 10 +-
.../lucene/analysis/util/CharacterUtils.java | 251 -
.../lucene/analysis/util/ElisionFilter.java | 2 +-
.../analysis/util/ElisionFilterFactory.java | 1 +
.../analysis/util/FilteringTokenFilter.java | 76 -
.../analysis/util/StopwordAnalyzerBase.java | 138 -
.../lucene/analysis/util/WordlistLoader.java | 244 -
.../apache/lucene/collation/package-info.java | 2 +-
.../lucene/analysis/ar/TestArabicAnalyzer.java | 2 +-
.../analysis/ar/TestArabicStemFilter.java | 2 +-
.../analysis/bg/TestBulgarianAnalyzer.java | 2 +-
.../analysis/bg/TestBulgarianStemmer.java | 2 +-
.../analysis/br/TestBrazilianAnalyzer.java | 2 +-
.../lucene/analysis/ca/TestCatalanAnalyzer.java | 2 +-
.../lucene/analysis/cjk/TestCJKAnalyzer.java | 4 +-
.../lucene/analysis/ckb/TestSoraniAnalyzer.java | 2 +-
.../commongrams/CommonGramsFilterTest.java | 2 +-
.../TestCommonGramsFilterFactory.java | 10 +-
.../TestCommonGramsQueryFilterFactory.java | 7 +-
.../compound/TestCompoundWordTokenFilter.java | 2 +-
.../lucene/analysis/core/TestAnalyzers.java | 1 +
.../analysis/core/TestBugInSomething.java | 2 +-
.../lucene/analysis/core/TestRandomChains.java | 6 +-
.../lucene/analysis/core/TestStopAnalyzer.java | 14 +-
.../lucene/analysis/core/TestStopFilter.java | 176 -
.../analysis/core/TestStopFilterFactory.java | 2 +-
.../lucene/analysis/cz/TestCzechAnalyzer.java | 4 +-
.../lucene/analysis/cz/TestCzechStemmer.java | 2 +-
.../lucene/analysis/da/TestDanishAnalyzer.java | 2 +-
.../lucene/analysis/de/TestGermanAnalyzer.java | 2 +-
.../analysis/de/TestGermanLightStemFilter.java | 2 +-
.../de/TestGermanMinimalStemFilter.java | 2 +-
.../analysis/de/TestGermanStemFilter.java | 4 +-
.../lucene/analysis/en/TestEnglishAnalyzer.java | 2 +-
.../analysis/en/TestPorterStemFilter.java | 8 +-
.../lucene/analysis/es/TestSpanishAnalyzer.java | 2 +-
.../lucene/analysis/eu/TestBasqueAnalyzer.java | 2 +-
.../lucene/analysis/fa/TestPersianAnalyzer.java | 2 +-
.../lucene/analysis/fi/TestFinnishAnalyzer.java | 2 +-
.../analysis/fi/TestFinnishLightStemFilter.java | 2 +-
.../lucene/analysis/fr/TestFrenchAnalyzer.java | 2 +-
.../analysis/fr/TestFrenchLightStemFilter.java | 2 +-
.../fr/TestFrenchMinimalStemFilter.java | 2 +-
.../lucene/analysis/ga/TestIrishAnalyzer.java | 2 +-
.../analysis/gl/TestGalicianAnalyzer.java | 2 +-
.../gl/TestGalicianMinimalStemFilter.java | 2 +-
.../lucene/analysis/hi/TestHindiAnalyzer.java | 2 +-
.../analysis/hu/TestHungarianAnalyzer.java | 2 +-
.../hu/TestHungarianLightStemFilter.java | 2 +-
.../hunspell/TestHunspellStemFilter.java | 2 +-
.../analysis/hy/TestArmenianAnalyzer.java | 2 +-
.../analysis/id/TestIndonesianAnalyzer.java | 2 +-
.../lucene/analysis/it/TestItalianAnalyzer.java | 2 +-
.../analysis/lt/TestLithuanianAnalyzer.java | 2 +-
.../lucene/analysis/lv/TestLatvianAnalyzer.java | 2 +-
.../miscellaneous/TestCapitalizationFilter.java | 2 +-
.../miscellaneous/TestKeepFilterFactory.java | 4 +-
.../miscellaneous/TestKeepWordFilter.java | 2 +-
.../miscellaneous/TestKeywordMarkerFilter.java | 4 +-
.../TestStemmerOverrideFilter.java | 2 +-
.../miscellaneous/TestWordDelimiterFilter.java | 12 +-
.../lucene/analysis/nl/TestDutchAnalyzer.java | 6 +-
.../analysis/no/TestNorwegianAnalyzer.java | 2 +-
.../no/TestNorwegianLightStemFilter.java | 2 +-
.../no/TestNorwegianMinimalStemFilter.java | 2 +-
.../analysis/pt/TestPortugueseAnalyzer.java | 2 +-
.../pt/TestPortugueseLightStemFilter.java | 2 +-
.../pt/TestPortugueseMinimalStemFilter.java | 2 +-
.../analysis/pt/TestPortugueseStemFilter.java | 6 +-
.../analysis/ro/TestRomanianAnalyzer.java | 2 +-
.../lucene/analysis/ru/TestRussianAnalyzer.java | 4 +-
.../analysis/ru/TestRussianLightStemFilter.java | 2 +-
.../shingle/ShingleAnalyzerWrapperTest.java | 4 +-
.../analysis/sinks/TestTeeSinkTokenFilter.java | 4 +-
.../analysis/standard/TestStandardAnalyzer.java | 390 --
.../standard/WordBreakTestUnicode_6_3_0.java | 5537 ------------------
.../generateJavaUnicodeWordBreakTest.pl | 232 -
.../lucene/analysis/sv/TestSwedishAnalyzer.java | 2 +-
.../analysis/sv/TestSwedishLightStemFilter.java | 2 +-
.../lucene/analysis/th/TestThaiAnalyzer.java | 2 +-
.../lucene/analysis/tr/TestTurkishAnalyzer.java | 2 +-
.../lucene/analysis/util/TestCharArrayMap.java | 244 -
.../lucene/analysis/util/TestCharArraySet.java | 429 --
.../analysis/util/TestCharacterUtils.java | 107 -
.../lucene/analysis/util/TestElision.java | 2 +-
.../util/TestFilesystemResourceLoader.java | 2 +
.../analysis/util/TestWordlistLoader.java | 79 -
lucene/analysis/icu/src/java/overview.html | 2 +-
.../segmentation/TestWithCJKBigramFilter.java | 4 +-
.../lucene/analysis/ja/JapaneseAnalyzer.java | 8 +-
.../analysis/ja/JapaneseNumberFilter.java | 4 +-
.../ja/JapanesePartOfSpeechStopFilter.java | 4 +-
.../JapanesePartOfSpeechStopFilterFactory.java | 2 +-
.../analysis/ja/TestJapaneseBaseFormFilter.java | 2 +-
.../ja/TestJapaneseKatakanaStemFilter.java | 6 +-
.../analysis/ja/TestJapaneseNumberFilter.java | 2 +-
.../analysis/morfologik/MorfologikFilter.java | 12 +-
.../morfologik/TestMorfologikAnalyzer.java | 2 +-
.../analysis/cn/smart/SmartChineseAnalyzer.java | 6 +-
.../lucene/analysis/pl/PolishAnalyzer.java | 14 +-
.../lucene/analysis/pl/TestPolishAnalyzer.java | 2 +-
lucene/common-build.xml | 30 +
lucene/core/build.xml | 18 +-
.../apache/lucene/analysis/CharArrayMap.java | 669 +++
.../apache/lucene/analysis/CharArraySet.java | 196 +
.../apache/lucene/analysis/CharacterUtils.java | 251 +
.../lucene/analysis/FilteringTokenFilter.java | 76 +
.../apache/lucene/analysis/LowerCaseFilter.java | 50 +
.../org/apache/lucene/analysis/StopFilter.java | 111 +
.../lucene/analysis/StopwordAnalyzerBase.java | 138 +
.../apache/lucene/analysis/WordlistLoader.java | 244 +
.../apache/lucene/analysis/package-info.java | 2 +-
.../analysis/standard/StandardAnalyzer.java | 115 +
.../analysis/standard/StandardFilter.java | 39 +
.../analysis/standard/StandardTokenizer.java | 213 +
.../standard/StandardTokenizerImpl.java | 823 +++
.../standard/StandardTokenizerImpl.jflex | 206 +
.../lucene/analysis/standard/package-info.java | 33 +
.../apache/lucene/index/IndexWriterConfig.java | 17 +-
.../org/apache/lucene/util/packed/Direct16.java | 2 +-
.../org/apache/lucene/util/packed/Direct32.java | 2 +-
.../org/apache/lucene/util/packed/Direct64.java | 2 +-
.../org/apache/lucene/util/packed/Direct8.java | 2 +-
.../lucene/util/packed/Packed16ThreeBlocks.java | 2 +-
.../lucene/util/packed/Packed64SingleBlock.java | 2 +-
.../lucene/util/packed/Packed8ThreeBlocks.java | 2 +-
lucene/core/src/java/overview.html | 2 +-
.../lucene/analysis/TestCharArrayMap.java | 244 +
.../lucene/analysis/TestCharArraySet.java | 430 ++
.../lucene/analysis/TestCharacterUtils.java | 107 +
.../apache/lucene/analysis/TestStopFilter.java | 176 +
.../lucene/analysis/TestWordlistLoader.java | 79 +
.../analysis/standard/TestStandardAnalyzer.java | 390 ++
.../suggest/analyzing/SuggestStopFilter.java | 4 +-
.../analyzing/SuggestStopFilterFactory.java | 10 +-
.../analyzing/AnalyzingInfixSuggesterTest.java | 4 +-
.../analyzing/BlendedInfixSuggesterTest.java | 2 +-
.../analyzing/TestFreeTextSuggester.java | 6 +-
.../analyzing/TestSuggestStopFilter.java | 4 +-
.../analyzing/TestSuggestStopFilterFactory.java | 2 +-
.../standard/WordBreakTestUnicode_6_3_0.java | 5537 ++++++++++++++++++
.../generateJavaUnicodeWordBreakTest.pl | 232 +
.../lucene/analysis/standard/package.html | 26 +
.../SolrStopwordsCarrot2LexicalDataFactory.java | 4 +-
.../apache/solr/core/SolrResourceLoader.java | 10 +-
.../analysis/ManagedStopFilterFactory.java | 4 +-
.../DocumentAnalysisRequestHandlerTest.java | 8 +-
.../FieldAnalysisRequestHandlerTest.java | 10 +-
.../spelling/TestSuggestSpellingConverter.java | 2 +-
234 files changed, 10991 insertions(+), 10871 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 68716c6..c5a85d0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -35,6 +35,10 @@ Improvements
write-once architecture, possibly catching externally caused
issues sooner (Robert Muir, Mike McCandless)
+* LUCENE-7318: StandardAnalyzer has been moved from the analysis
+ module into core and is now the default analyzer in
+ IndexWriterConfig (Robert Muir, Mike McCandless)
+
Optimizations
* LUCENE-7330: Speed up conjunction queries. (Adrien Grand)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/build.xml b/lucene/analysis/common/build.xml
index 670e6ab..56ca4d0 100644
--- a/lucene/analysis/common/build.xml
+++ b/lucene/analysis/common/build.xml
@@ -33,7 +33,7 @@
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
- <target name="jflex" depends="-install-jflex,clean-jflex,-jflex-StandardAnalyzer,-jflex-UAX29URLEmailTokenizer,
+ <target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
<target name="-jflex-HTMLStripCharFilter"
@@ -62,45 +62,14 @@
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
</target>
- <target name="-jflex-StandardAnalyzer" depends="init,-install-jflex">
- <run-jflex-and-disable-buffer-expansion
- dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
- <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
- </target>
-
<target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
<run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
- <macrodef name="run-jflex">
- <attribute name="dir"/>
- <attribute name="name"/>
- <sequential>
- <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
- </sequential>
- </macrodef>
-
- <macrodef name="run-jflex-and-disable-buffer-expansion">
- <attribute name="dir"/>
- <attribute name="name"/>
- <sequential>
- <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
- <!-- LUCENE-5897: Disallow scanner buffer expansion -->
- <replaceregexp file="@{dir}/@{name}.java"
- match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
- replace="" flags="s" />
- <replaceregexp file="@{dir}/@{name}.java"
- match="private static final int ZZ_BUFFERSIZE ="
- replace="private int ZZ_BUFFERSIZE ="/>
- <replaceregexp file="@{dir}/@{name}.java"
- match="int requested = zzBuffer.length - zzEndRead;"
- replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
- <replaceregexp file="@{dir}/@{name}.java"
- match="(zzFinalHighSurrogate = 1;)(\r?\n)"
- replace="\1\2 if (totalRead == 1) { return true; }\2"/>
- </sequential>
- </macrodef>
+ <target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
+ <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
+ </target>
<target name="clean-jflex">
<delete>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index 3d36c86..71da32d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -21,15 +21,15 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
index 24746e4..9cb0657 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -21,15 +21,15 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* {@link Analyzer} for Bulgarian.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 3b02567..5dd0cbc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
index cb674de..739b61a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.CatalanStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
index fea84d8..68a939b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
@@ -24,8 +24,8 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
index ed8eee6..d500ff9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
@@ -20,13 +20,13 @@ package org.apache.lucene.analysis.cjk;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
index 78304c7..5fd1bec 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
index 35dedde..75e991f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
@@ -18,14 +18,14 @@ package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
/*
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index ebd5ec3..946003f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
import java.util.Map;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
@@ -82,4 +83,4 @@ public class CommonGramsFilterFactory extends TokenFilterFactory implements Reso
}
-
\ No newline at end of file
+
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
index 1920401..680e67a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -17,15 +17,15 @@
package org.apache.lucene.analysis.compound;
+import java.io.IOException;
+import java.util.LinkedList;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
-
-import java.io.IOException;
-import java.util.LinkedList;
/**
* Base class for decomposition token filters.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
index b81a96c..2e4b837 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.compound;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
/**
* A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many Germanic languages.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
index 440ab5e..d31cdf8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
@@ -17,15 +17,15 @@
package org.apache.lucene.analysis.compound;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import java.util.Map;
-import java.io.IOException;
-
/**
* Factory for {@link DictionaryCompoundWordTokenFilter}.
* <pre class="prettyprint">
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
index bef438c..41f92c9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
@@ -16,15 +16,14 @@
*/
package org.apache.lucene.analysis.compound;
+import java.io.IOException;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.xml.sax.InputSource;
-import java.io.IOException;
-
/**
* A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many Germanic languages.
*
@@ -82,7 +81,7 @@ public class HyphenationCompoundWordTokenFilter extends
/**
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
* <p>
- * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.util.CharArraySet, int, int, int, boolean)
+ * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.CharArraySet, int, int, int, boolean)
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
* null, minWordSize, minSubwordSize, maxSubwordSize }
*/
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
index 9ffe405..37421bb 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
@@ -17,19 +17,18 @@
package org.apache.lucene.analysis.compound;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.IOUtils;
-
-import java.util.Map;
-import java.io.IOException;
-import java.io.InputStream;
-
import org.xml.sax.InputSource;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
deleted file mode 100644
index ade6a58..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.core;
-
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
-
-/**
- * Normalizes token text to lower case.
- */
-public final class LowerCaseFilter extends TokenFilter {
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
- /**
- * Create a new LowerCaseFilter, that normalizes token text to lower case.
- *
- * @param in TokenStream to filter
- */
- public LowerCaseFilter(TokenStream in) {
- super(in);
- }
-
- @Override
- public final boolean incrementToken() throws IOException {
- if (input.incrementToken()) {
- CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
- return true;
- } else
- return false;
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java
index 785daa5..0bd9795 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.core;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
index 45c8d23..d0fdcf6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
index 0c8fdc8..3fa4982 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
@@ -20,13 +20,14 @@ package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.List;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
* Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
@@ -35,19 +36,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are not usually useful
for searching.*/
- public static final CharArraySet ENGLISH_STOP_WORDS_SET;
-
- static {
- final List<String> stopWords = Arrays.asList(
- "a", "an", "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it",
- "no", "not", "of", "on", "or", "such",
- "that", "the", "their", "then", "there", "these",
- "they", "this", "to", "was", "will", "with"
- );
- final CharArraySet stopSet = new CharArraySet(stopWords, false);
- ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
- }
+ public static final CharArraySet ENGLISH_STOP_WORDS_SET = StandardAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer which removes words in
* {@link #ENGLISH_STOP_WORDS_SET}.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
deleted file mode 100644
index fc33a1c..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.core;
-
-
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
-
-/**
- * Removes stop words from a token stream.
- */
-public final class StopFilter extends FilteringTokenFilter {
-
- private final CharArraySet stopWords;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
- /**
- * Constructs a filter which removes words from the input TokenStream that are
- * named in the Set.
- *
- * @param in
- * Input stream
- * @param stopWords
- * A {@link CharArraySet} representing the stopwords.
- * @see #makeStopSet(java.lang.String...)
- */
- public StopFilter(TokenStream in, CharArraySet stopWords) {
- super(in);
- this.stopWords = stopWords;
- }
-
- /**
- * Builds a Set from an array of stop words,
- * appropriate for passing into the StopFilter constructor.
- * This permits this stopWords construction to be cached once when
- * an Analyzer is constructed.
- *
- * @param stopWords An array of stopwords
- * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
- */
- public static CharArraySet makeStopSet(String... stopWords) {
- return makeStopSet(stopWords, false);
- }
-
- /**
- * Builds a Set from an array of stop words,
- * appropriate for passing into the StopFilter constructor.
- * This permits this stopWords construction to be cached once when
- * an Analyzer is constructed.
- *
- * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
- * @return A Set ({@link CharArraySet}) containing the words
- * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
- */
- public static CharArraySet makeStopSet(List<?> stopWords) {
- return makeStopSet(stopWords, false);
- }
-
- /**
- * Creates a stopword set from the given stopword array.
- *
- * @param stopWords An array of stopwords
- * @param ignoreCase If true, all words are lower cased first.
- * @return a Set containing the words
- */
- public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
- CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
- stopSet.addAll(Arrays.asList(stopWords));
- return stopSet;
- }
-
- /**
- * Creates a stopword set from the given stopword list.
- * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
- * @param ignoreCase if true, all words are lower cased first
- * @return A Set ({@link CharArraySet}) containing the words
- */
- public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase){
- CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
- stopSet.addAll(stopWords);
- return stopSet;
- }
-
- /**
- * Returns the next input Token whose term() is not a stop word.
- */
- @Override
- protected boolean accept() {
- return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
index d3f6aff..17e2a89 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
@@ -17,15 +17,16 @@
package org.apache.lucene.analysis.core;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.WordlistLoader; // jdocs
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import org.apache.lucene.analysis.util.WordlistLoader; // jdocs
-
-import java.util.Map;
-import java.io.IOException;
/**
* Factory for {@link StopFilter}.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java
index d7447d6..cc1547c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java
@@ -19,9 +19,9 @@ package org.apache.lucene.analysis.core;
import java.util.Set;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
/**
* Removes tokens whose types appear in a set of blocked types from a token stream.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
index 6d3f6bb..7b28997 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
@@ -19,10 +19,11 @@ package org.apache.lucene.analysis.core;
import java.io.IOException;
+import org.apache.lucene.analysis.CharacterUtils;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Normalizes token text to UPPER CASE.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 6b664c3..9777179 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -17,22 +17,22 @@
package org.apache.lucene.analysis.cz;
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
-import java.io.*;
-import java.nio.charset.StandardCharsets;
-
/**
* {@link Analyzer} for Czech language.
* <p>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
index 1b11a1c..f9c316d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.tartarus.snowball.ext.DanishStemmer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 23e01be..790fc48 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -23,18 +23,18 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
index f039edb..c85b6ec 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@@ -20,14 +20,14 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* {@link Analyzer} for the Greek language.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java
index c09cafa..75d0840 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java
@@ -16,7 +16,7 @@
*/
package org.apache.lucene.analysis.el;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import java.util.Arrays;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
index 721d9b2..16dc0c5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
@@ -20,16 +20,16 @@ package org.apache.lucene.analysis.en;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* {@link Analyzer} for English.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java
index 3348d9a..f0bfecd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java
@@ -55,7 +55,7 @@ the original shown below)
*/
package org.apache.lucene.analysis.en;
-import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* <p>Title: Kstemmer</p>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
index 0e4747f..ab5b6c3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
index db83cfb..cff2da0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
@@ -21,16 +21,16 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.BasqueStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
index 256c78b..2515d1e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -21,15 +21,15 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
index 4cc62db..6b00101 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.tartarus.snowball.ext.FinnishStemmer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 86088fd..5f90246 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -17,27 +17,27 @@
package org.apache.lucene.analysis.fr;
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
-import java.io.IOException;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-
/**
* {@link Analyzer} for French language.
* <p>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
index 1e6d39a..1ca3455 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
@@ -22,16 +22,16 @@ import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.IrishStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
index b9de3fa..372a6ec 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
@@ -22,16 +22,16 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
index 22e930b..1b57129 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
@@ -20,16 +20,16 @@ package org.apache.lucene.analysis.hi;
import java.io.IOException;
import java.io.Reader;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
index 31fe9e2..0615bdc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
@@ -22,17 +22,17 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.tartarus.snowball.ext.HungarianStemmer;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 748b3f1..7687d21 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -22,7 +22,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
index 857117a..8c04639 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
@@ -21,16 +21,16 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.ArmenianStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
index f7be17f..fc9b4d2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
@@ -20,15 +20,15 @@ package org.apache.lucene.analysis.id;
import java.io.IOException;
import java.io.Reader;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
/**
* Analyzer for Indonesian (Bahasa)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
index 27027fa..a18aa5d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
@@ -23,18 +23,18 @@ import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
index f0424c9..5e24cf9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
@@ -21,16 +21,16 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.tartarus.snowball.ext.LithuanianStemmer;
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
index b22339d..0a016af 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
@@ -22,16 +22,16 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**