You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2022/01/05 14:42:52 UTC
[lucene] branch branch_9x updated: LUCENE-10352: Convert TestAllAnalyzersHaveFactories and TestRandomChains to a global integration test and discover classes to check from module system (#582)
This is an automated email from the ASF dual-hosted git repository.
uschindler pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 7525941 LUCENE-10352: Convert TestAllAnalyzersHaveFactories and TestRandomChains to a global integration test and discover classes to check from module system (#582)
7525941 is described below
commit 75259417f1b8de05eda4cf3a8b8c5e8177c7f0dd
Author: Uwe Schindler <us...@apache.org>
AuthorDate: Wed Jan 5 15:35:02 2022 +0100
LUCENE-10352: Convert TestAllAnalyzersHaveFactories and TestRandomChains to a global integration test and discover classes to check from module system (#582)
Co-authored-by: Robert Muir <rm...@apache.org>
---
gradle/java/modules.gradle | 2 +-
gradle/validation/rat-sources.gradle | 1 +
lucene/CHANGES.txt | 13 +
.../phonetic => analysis.tests}/build.gradle | 21 +-
.../src/test}/module-info.java | 26 +-
.../analysis/tests/ModuleClassDiscovery.java | 95 ++
.../tests}/TestAllAnalyzersHaveFactories.java | 90 +-
.../lucene/analysis/tests/TestRandomChains.java | 961 ++++++++++++++++
.../org/apache/lucene/analysis/tests/da_UTF8.xml | 1208 ++++++++++++++++++++
.../org/apache/lucene/analysis/tests/simple.aff | 20 +
.../org/apache/lucene/analysis/tests/simple.dic | 11 +
.../analysis/boost/DelimitedBoostTokenFilter.java | 3 +
.../lucene/analysis/cjk/CJKBigramFilter.java | 2 +
.../analysis/commongrams/CommonGramsFilter.java | 6 +-
.../commongrams/CommonGramsQueryFilter.java | 2 +
.../lucene/analysis/core/LowerCaseFilter.java | 2 +
.../apache/lucene/analysis/core/StopFilter.java | 2 +
.../DelimitedTermFrequencyTokenFilter.java | 3 +
.../miscellaneous/HyphenatedWordsFilter.java | 3 +
.../miscellaneous/LimitTokenCountFilter.java | 2 +
.../miscellaneous/LimitTokenOffsetFilter.java | 2 +
.../miscellaneous/LimitTokenPositionFilter.java | 2 +
.../miscellaneous/WordDelimiterGraphFilter.java | 2 +
.../analysis/path/PathHierarchyTokenizer.java | 2 +
.../path/ReversePathHierarchyTokenizer.java | 2 +
.../analysis/wikipedia/WikipediaTokenizer.java | 2 +
.../lucene/analysis/core/TestBugInSomething.java | 67 +-
.../apache/lucene/analysis/core/TestFactories.java | 2 +-
.../lucene/analysis/core/TestRandomChains.java | 1045 -----------------
.../TestKeywordMarkerFilterFactory.java | 2 +-
.../TestStemmerOverrideFilterFactory.java | 2 +-
.../pattern/TestPatternTypingFilterFactory.java | 2 +-
.../snowball/TestSnowballPorterFilterFactory.java | 2 +-
.../analysis/synonym/TestMultiWordSynonyms.java | 2 +-
.../analysis/synonym/TestSynonymFilterFactory.java | 2 +-
.../util/TestFilesystemResourceLoader.java | 1 +
.../analysis/ja/JapaneseCompletionFilter.java | 2 +
.../ja/JapaneseIterationMarkCharFilter.java | 3 +
.../analysis/ja/JapaneseKatakanaStemFilter.java | 3 +
.../lucene/analysis/ja/JapaneseNumberFilter.java | 2 +
.../lucene/analysis/ja/JapaneseTokenizer.java | 2 +
.../analysis/ja/StringMockResourceLoader.java | 46 -
.../apache/lucene/analysis/ja/TestFactories.java | 1 +
.../ja/TestJapaneseBaseFormFilterFactory.java | 1 +
...TestJapaneseIterationMarkCharFilterFactory.java | 1 +
.../ja/TestJapaneseKatakanaStemFilterFactory.java | 1 +
.../ja/TestJapaneseNumberFilterFactory.java | 1 +
.../TestJapanesePartOfSpeechStopFilterFactory.java | 1 +
.../ja/TestJapaneseReadingFormFilterFactory.java | 1 +
.../analysis/ja/TestJapaneseTokenizerFactory.java | 1 +
.../analysis/morfologik/MorfologikFilter.java | 2 +
lucene/analysis/nori/src/java/module-info.java | 1 +
.../lucene/analysis/ko/KoreanNumberFilter.java | 2 +
.../apache/lucene/analysis/ko/KoreanTokenizer.java | 3 +
.../org.apache.lucene.analysis.TokenFilterFactory | 1 +
.../analysis/ko/StringMockResourceLoader.java | 46 -
.../analysis/ko/TestKoreanNumberFilterFactory.java | 1 +
.../TestKoreanPartOfSpeechStopFilterFactory.java | 1 +
.../ko/TestKoreanReadingFormFilterFactory.java | 1 +
.../analysis/ko/TestKoreanTokenizerFactory.java | 1 +
.../analysis/opennlp/OpenNLPChunkerFilter.java | 2 +
.../analysis/opennlp/OpenNLPLemmatizerFilter.java | 2 +
.../lucene/analysis/opennlp/OpenNLPPOSFilter.java | 2 +
.../lucene/analysis/opennlp/OpenNLPTokenizer.java | 2 +
lucene/analysis/phonetic/build.gradle | 2 +-
lucene/analysis/phonetic/src/java/module-info.java | 1 +
.../analysis/phonetic/BeiderMorseFilter.java | 4 +
.../analysis/phonetic/DoubleMetaphoneFilter.java | 3 +
.../org.apache.lucene.analysis.TokenFilterFactory | 1 +
.../apache/lucene/analysis/CachingTokenFilter.java | 4 +
.../apache/lucene/util/IgnoreRandomChains.java} | 28 +-
.../tests}/util/StringMockResourceLoader.java | 2 +-
settings.gradle | 1 +
73 files changed, 2539 insertions(+), 1252 deletions(-)
diff --git a/gradle/java/modules.gradle b/gradle/java/modules.gradle
index 0855423..5e334ab 100644
--- a/gradle/java/modules.gradle
+++ b/gradle/java/modules.gradle
@@ -214,7 +214,7 @@ allprojects {
}
// Configure (tasks.test, sourceSets.test)
- tasks.matching { it.name == "test" }.all { Test task ->
+ tasks.matching { it.name ==~ /test(_[0-9]+)?/ }.all { Test task ->
configureTestTaskForSourceSet(task, task.project.sourceSets.test)
}
diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle
index 2acc34e..622255b 100644
--- a/gradle/validation/rat-sources.gradle
+++ b/gradle/validation/rat-sources.gradle
@@ -102,6 +102,7 @@ allprojects {
break
case ":lucene:analysis:common":
+ case ":lucene:analysis.tests":
exclude "src/**/*.aff"
exclude "src/**/*.dic"
exclude "src/**/*.good"
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index d11c51e..8eaf5cb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -118,6 +118,12 @@ Bug Fixes
* LUCENE-10349: Fix all analyzers to behave according to their documentation:
getDefaultStopSet() methods now return unmodifiable CharArraySets. (Uwe Schindler)
+* LUCENE-10352: Add missing service provider entries: KoreanNumberFilterFactory,
+ DaitchMokotoffSoundexFilterFactory (Uwe Schindler, Robert Muir)
+
+* LUCENE-10352: Fixed ctor argument checks: JapaneseKatakanaStemFilter,
+ DoubleMetaphoneFilter (Uwe Schindler, Robert Muir)
+
Other
---------------------
@@ -128,6 +134,13 @@ Other
* LUCENE-10310: TestXYDocValuesQueries#doRandomDistanceTest does not produce random circles with radius
with '0' value any longer.
+* LUCENE-10352: Removed duplicate instances of StringMockResourceLoader and migrated class to
+ test-framework. (Uwe Schindler, Robert Muir)
+
+* LUCENE-10352: Convert TestAllAnalyzersHaveFactories and TestRandomChains to a global integration test
+ and discover classes to check from module system. The test now checks all analyzer modules,
+ so it may discover new bugs outside of analysis:common module. (Uwe Schindler, Robert Muir)
+
======================= Lucene 9.0.0 =======================
New Features
diff --git a/lucene/analysis/phonetic/build.gradle b/lucene/analysis.tests/build.gradle
similarity index 54%
copy from lucene/analysis/phonetic/build.gradle
copy to lucene/analysis.tests/build.gradle
index e5595cb..be1c51f 100644
--- a/lucene/analysis/phonetic/build.gradle
+++ b/lucene/analysis.tests/build.gradle
@@ -17,14 +17,17 @@
apply plugin: 'java-library'
-description = 'Analyzer for indexing phonetic signatures (for sounds-alike search)'
+description = 'Module integration tests for all :lucene:analysis modules'
dependencies {
- moduleApi project(':lucene:core')
- moduleApi project(':lucene:analysis:common')
-
- moduleImplementation 'commons-codec:commons-codec'
-
- testImplementation project(':lucene:test-framework')
-}
-
+ moduleTestImplementation project(':lucene:analysis:common')
+ moduleTestImplementation project(':lucene:analysis:icu')
+ moduleTestImplementation project(':lucene:analysis:kuromoji')
+ moduleTestImplementation project(':lucene:analysis:morfologik')
+ moduleTestImplementation project(':lucene:analysis:nori')
+ moduleTestImplementation project(':lucene:analysis:opennlp')
+ moduleTestImplementation project(':lucene:analysis:phonetic')
+ moduleTestImplementation project(':lucene:analysis:smartcn')
+ moduleTestImplementation project(':lucene:analysis:stempel')
+ moduleTestImplementation project(':lucene:test-framework')
+}
diff --git a/lucene/analysis/phonetic/src/java/module-info.java b/lucene/analysis.tests/src/test/module-info.java
similarity index 58%
copy from lucene/analysis/phonetic/src/java/module-info.java
copy to lucene/analysis.tests/src/test/module-info.java
index 706251a..5026116 100644
--- a/lucene/analysis/phonetic/src/java/module-info.java
+++ b/lucene/analysis.tests/src/test/module-info.java
@@ -15,17 +15,25 @@
* limitations under the License.
*/
-/** Analyzer for indexing phonetic signatures */
+/**
+ * Test module for global integration tests of all {@code org.apache.lucene.analysis}
+ * packages/modules.
+ */
@SuppressWarnings({"requires-automatic"})
-module org.apache.lucene.analysis.phonetic {
- requires org.apache.commons.codec;
+module org.apache.lucene.analysis.tests {
+ requires java.xml;
requires org.apache.lucene.core;
requires org.apache.lucene.analysis.common;
+ requires org.apache.lucene.analysis.icu;
+ requires org.apache.lucene.analysis.kuromoji;
+ requires org.apache.lucene.analysis.morfologik;
+ requires org.apache.lucene.analysis.nori;
+ requires org.apache.lucene.analysis.opennlp;
+ requires org.apache.lucene.analysis.phonetic;
+ requires org.apache.lucene.analysis.smartcn;
+ requires org.apache.lucene.analysis.stempel;
+ requires org.apache.lucene.test_framework;
+ requires junit;
- exports org.apache.lucene.analysis.phonetic;
-
- provides org.apache.lucene.analysis.TokenFilterFactory with
- org.apache.lucene.analysis.phonetic.BeiderMorseFilterFactory,
- org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilterFactory,
- org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
+ exports org.apache.lucene.analysis.tests;
}
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/ModuleClassDiscovery.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/ModuleClassDiscovery.java
new file mode 100644
index 0000000..28b90e5
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/ModuleClassDiscovery.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.tests;
+
+import java.io.IOException;
+import java.lang.module.ResolvedModule;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.function.Predicate;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.junit.Assert;
+
+/** Discovers all classes from the module graph and loads them (without initialization) */
+abstract class ModuleClassDiscovery {
+
+ private static final Module THIS_MODULE = ModuleClassDiscovery.class.getModule();
+ private static final ModuleLayer LAYER = THIS_MODULE.getLayer();
+ private static final SortedMap<String, ResolvedModule> ALL_ANALYSIS_MODULES;
+
+ private static final Predicate<String> ALLOW_MODULES =
+ name ->
+ name.equals("org.apache.lucene.core") || name.startsWith("org.apache.lucene.analysis.");
+
+ static {
+ Assert.assertTrue(
+ "Analysis integration tests must run in Java Module System as named module",
+ THIS_MODULE.isNamed());
+ Assert.assertNotNull("Module layer is missing", LAYER);
+
+ var mods = new TreeMap<String, ResolvedModule>();
+ discoverAnalysisModules(LAYER, mods);
+ ALL_ANALYSIS_MODULES = Collections.unmodifiableSortedMap(mods);
+ if (LuceneTestCase.VERBOSE) {
+ System.out.println(
+ "Discovered the following analysis modules: " + ALL_ANALYSIS_MODULES.keySet());
+ }
+ }
+
+ private static void discoverAnalysisModules(
+ ModuleLayer layer, Map<String, ResolvedModule> result) {
+ for (var mod : layer.configuration().modules()) {
+ String name = mod.name();
+ if (ALLOW_MODULES.test(name) && !Objects.equals(name, THIS_MODULE.getName())) {
+ result.put(name, mod);
+ }
+ }
+ for (var parent : layer.parents()) {
+ discoverAnalysisModules(parent, result);
+ }
+ }
+
+ /** Finds all classes in package across all analysis modules */
+ public static List<Class<?>> getClassesForPackage(String pkgname) throws IOException {
+ final var prefix = pkgname.concat(".");
+ final var classes = new ArrayList<Class<?>>();
+ for (var resolvedModule : ALL_ANALYSIS_MODULES.values()) {
+ final var module = LAYER.findModule(resolvedModule.name()).orElseThrow();
+ try (var reader = resolvedModule.reference().open()) {
+ reader
+ .list()
+ .filter(entry -> entry.endsWith(".class"))
+ .map(entry -> entry.substring(0, entry.length() - 6).replace('/', '.'))
+ .filter(clazzname -> clazzname.startsWith(prefix))
+ .sorted()
+ .map(
+ clazzname ->
+ Objects.requireNonNull(
+ Class.forName(module, clazzname),
+ "Class '" + clazzname + "' not found in module '" + module.getName() + "'"))
+ .forEach(classes::add);
+ }
+ }
+ Assert.assertFalse("No classes found in package:" + pkgname, classes.isEmpty());
+ return classes;
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestAllAnalyzersHaveFactories.java
similarity index 69%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
rename to lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestAllAnalyzersHaveFactories.java
index 945177b..c7df6e1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestAllAnalyzersHaveFactories.java
@@ -14,15 +14,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.analysis.core;
+package org.apache.lucene.analysis.tests;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Modifier;
-import java.util.Collections;
import java.util.HashMap;
-import java.util.HashSet;
-import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -34,27 +31,17 @@ import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenizerFactory;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.sr.SerbianNormalizationRegularFilter;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
-import org.apache.lucene.tests.analysis.CrankyTokenFilter;
-import org.apache.lucene.tests.analysis.MockCharFilter;
-import org.apache.lucene.tests.analysis.MockFixedLengthPayloadFilter;
-import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
-import org.apache.lucene.tests.analysis.MockHoleInjectingTokenFilter;
-import org.apache.lucene.tests.analysis.MockLowerCaseFilter;
-import org.apache.lucene.tests.analysis.MockRandomLookaheadTokenFilter;
-import org.apache.lucene.tests.analysis.MockSynonymFilter;
-import org.apache.lucene.tests.analysis.MockTokenFilter;
-import org.apache.lucene.tests.analysis.MockTokenizer;
-import org.apache.lucene.tests.analysis.MockVariableLengthPayloadFilter;
-import org.apache.lucene.tests.analysis.SimplePayloadFilter;
-import org.apache.lucene.tests.analysis.ValidatingTokenFilter;
+import org.apache.lucene.analysis.stempel.StempelFilter;
import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.util.Version;
@@ -65,71 +52,37 @@ import org.apache.lucene.util.Version;
*/
public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
- // these are test-only components (e.g. test-framework)
- private static final Set<Class<?>> testComponents =
- Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
-
- static {
- Collections.<Class<?>>addAll(
- testComponents,
- MockTokenizer.class,
- MockCharFilter.class,
- MockFixedLengthPayloadFilter.class,
- MockGraphTokenFilter.class,
- MockHoleInjectingTokenFilter.class,
- MockLowerCaseFilter.class,
- MockRandomLookaheadTokenFilter.class,
- MockSynonymFilter.class,
- MockTokenFilter.class,
- MockVariableLengthPayloadFilter.class,
- ValidatingTokenFilter.class,
- CrankyTokenFilter.class,
- SimplePayloadFilter.class);
- }
-
// these are 'crazy' components like cachingtokenfilter. does it make sense to add factories for
// these?
private static final Set<Class<?>> crazyComponents =
- Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
-
- static {
- Collections.<Class<?>>addAll(
- crazyComponents, CachingTokenFilter.class, TeeSinkTokenFilter.class);
- }
+ Set.of(CachingTokenFilter.class, TeeSinkTokenFilter.class);
// these are oddly-named (either the actual analyzer, or its factory)
// they do actually have factories.
// TODO: clean this up!
private static final Set<Class<?>> oddlyNamedComponents =
- Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
-
- static {
- Collections.<Class<?>>addAll(
- oddlyNamedComponents,
- // this is supported via an option to PathHierarchyTokenizer's factory
- ReversePathHierarchyTokenizer.class,
- SnowballFilter.class, // this is called SnowballPorterFilterFactory
- PatternKeywordMarkerFilter.class,
- SetKeywordMarkerFilter.class,
- UnicodeWhitespaceTokenizer.class, // a supported option via WhitespaceTokenizerFactory
- // class from core, but StopFilterFactory creates one from this module
- org.apache.lucene.analysis.StopFilter.class,
- // class from core, but LowerCaseFilterFactory creates one from this module
- org.apache.lucene.analysis.LowerCaseFilter.class);
- }
+ Set.of(
+ // this is supported via an option to PathHierarchyTokenizer's factory
+ ReversePathHierarchyTokenizer.class,
+ SnowballFilter.class, // this is called SnowballPorterFilterFactory
+ StempelFilter.class, // this is called StempelPolishStemFilterFactory
+ PatternKeywordMarkerFilter.class,
+ SetKeywordMarkerFilter.class,
+ UnicodeWhitespaceTokenizer.class, // a supported option via WhitespaceTokenizerFactory
+ // class from core, but StopFilterFactory creates one from this module
+ org.apache.lucene.analysis.StopFilter.class,
+ // class from core, but LowerCaseFilterFactory creates one from this module
+ org.apache.lucene.analysis.LowerCaseFilter.class);
// The following token filters are excused from having their factory.
- private static final Set<Class<?>> tokenFiltersWithoutFactory = new HashSet<>();
-
- static {
- tokenFiltersWithoutFactory.add(SerbianNormalizationRegularFilter.class);
- }
+ private static final Set<Class<?>> tokenFiltersWithoutFactory =
+ Set.of(SerbianNormalizationRegularFilter.class);
private static final ResourceLoader loader = new StringMockResourceLoader("");
public void test() throws Exception {
List<Class<?>> analysisClasses =
- TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
+ ModuleClassDiscovery.getClassesForPackage("org.apache.lucene.analysis");
for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();
@@ -141,7 +94,6 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
|| c.isAnonymousClass()
|| c.isMemberClass()
|| c.isInterface()
- || testComponents.contains(c)
|| crazyComponents.contains(c)
|| oddlyNamedComponents.contains(c)
|| tokenFiltersWithoutFactory.contains(c)
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java
new file mode 100644
index 0000000..208c882
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java
@@ -0,0 +1,961 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.tests;
+
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.Transliterator;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Modifier;
+import java.text.DateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.commons.codec.Encoder;
+import org.apache.commons.codec.language.Caverphone2;
+import org.apache.commons.codec.language.ColognePhonetic;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Nysiis;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.commons.codec.language.Soundex;
+import org.apache.commons.codec.language.bm.PhoneticEngine;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.minhash.MinHashFilter;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
+import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+import org.apache.lucene.analysis.pattern.PatternTypingFilter;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.pl.PolishAnalyzer;
+import org.apache.lucene.analysis.shingle.FixedShingleFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.stempel.StempelStemmer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.store.ByteBuffersDirectory;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.analysis.MockTokenFilter;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.apache.lucene.tests.analysis.ValidatingTokenFilter;
+import org.apache.lucene.tests.util.Rethrow;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IgnoreRandomChains;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.tartarus.snowball.SnowballStemmer;
+import org.xml.sax.InputSource;
+
+/** tests random analysis chains */
+public class TestRandomChains extends BaseTokenStreamTestCase {
+
+ static List<Constructor<? extends Tokenizer>> tokenizers;
+ static List<Constructor<? extends TokenFilter>> tokenfilters;
+ static List<Constructor<? extends CharFilter>> charfilters;
+
+ static List<Class<? extends SnowballStemmer>> snowballStemmers;
+
+ private static final Set<Class<?>> avoidConditionals =
+ Set.of(
+ FingerprintFilter.class,
+ MinHashFilter.class,
+ ConcatenateGraphFilter.class,
+ // ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can
+ // expose inconsistent offsets
+ // https://issues.apache.org/jira/browse/LUCENE-4170
+ ShingleFilter.class,
+ FixedShingleFilter.class,
+ // FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition
+ // can break position lengths
+ FlattenGraphFilter.class,
+ // LimitToken*Filters don't set end offsets correctly
+ LimitTokenOffsetFilter.class,
+ LimitTokenCountFilter.class,
+ LimitTokenPositionFilter.class);
+
+ private static final Map<Constructor<?>, Predicate<Object[]>> brokenConstructors;
+
+ static {
+ try {
+ final Map<Constructor<?>, Predicate<Object[]>> map = new HashMap<>();
+ // LimitToken*Filter can only use special ctor when last arg is true
+ for (final var c :
+ List.of(
+ LimitTokenCountFilter.class,
+ LimitTokenOffsetFilter.class,
+ LimitTokenPositionFilter.class)) {
+ map.put(
+ c.getConstructor(TokenStream.class, int.class, boolean.class),
+ args -> {
+ assert args.length == 3;
+ return false == ((Boolean) args[2]); // args are broken if consumeAllTokens is false
+ });
+ }
+ brokenConstructors = Collections.unmodifiableMap(map);
+ } catch (Exception e) {
+ throw new Error(e);
+ }
+ }
+
+ private static final Map<Class<?>, Function<Random, Object>> argProducers =
+ Collections.unmodifiableMap(
+ new IdentityHashMap<Class<?>, Function<Random, Object>>() {
+ {
+ put(
+ int.class,
+ random -> {
+ // TODO: could cause huge ram usage to use full int range for some filters
+ // (e.g. allocate enormous arrays)
+ // return Integer.valueOf(random.nextInt());
+ return Integer.valueOf(TestUtil.nextInt(random, -50, 50));
+ });
+ put(
+ char.class,
+ random -> {
+ // TODO: fix any filters that care to throw IAE instead.
+ // also add a unicode validating filter to validate termAtt?
+ // return Character.valueOf((char)random.nextInt(65536));
+ while (true) {
+ char c = (char) random.nextInt(65536);
+ if (c < '\uD800' || c > '\uDFFF') {
+ return Character.valueOf(c);
+ }
+ }
+ });
+ put(float.class, Random::nextFloat);
+ put(boolean.class, Random::nextBoolean);
+ put(byte.class, random -> (byte) random.nextInt(256));
+ put(
+ byte[].class,
+ random -> {
+ byte[] bytes = new byte[random.nextInt(256)];
+ random.nextBytes(bytes);
+ return bytes;
+ });
+ put(Random.class, random -> new Random(random.nextLong()));
+ put(Version.class, random -> Version.LATEST);
+ put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory);
+ put(AttributeSource.class, random -> null); // force IAE/NPE
+ put(
+ Set.class,
+ random -> {
+ // TypeTokenFilter
+ Set<String> set = new HashSet<>();
+ int num = random.nextInt(5);
+ for (int i = 0; i < num; i++) {
+ set.add(
+ StandardTokenizer.TOKEN_TYPES[
+ random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
+ }
+ return set;
+ });
+ put(
+ Collection.class,
+ random -> {
+ // CapitalizationFilter
+ Collection<char[]> col = new ArrayList<>();
+ int num = random.nextInt(5);
+ for (int i = 0; i < num; i++) {
+ col.add(TestUtil.randomSimpleString(random).toCharArray());
+ }
+ return col;
+ });
+ put(
+ CharArraySet.class,
+ random -> {
+ int num = random.nextInt(10);
+ CharArraySet set = new CharArraySet(num, random.nextBoolean());
+ for (int i = 0; i < num; i++) {
+ // TODO: make nastier
+ set.add(TestUtil.randomSimpleString(random));
+ }
+ return set;
+ });
+ // TODO: don't want to make the exponentially slow ones Dawid documents
+ // in TestPatternReplaceFilter, so dont use truly random patterns (for now)
+ put(Pattern.class, random -> Pattern.compile("a"));
+ put(
+ Pattern[].class,
+ random ->
+ new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")});
+ put(
+ PayloadEncoder.class,
+ random ->
+ new IdentityEncoder()); // the other encoders will throw exceptions if tokens
+ // arent numbers?
+ put(
+ Dictionary.class,
+ random -> {
+ // TODO: make nastier
+ InputStream affixStream =
+ TestRandomChains.class.getResourceAsStream("simple.aff");
+ InputStream dictStream =
+ TestRandomChains.class.getResourceAsStream("simple.dic");
+ try {
+ return new Dictionary(
+ new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ });
+ put(
+ HyphenationTree.class,
+ random -> {
+ // TODO: make nastier
+ try {
+ InputSource is =
+ new InputSource(
+ TestRandomChains.class.getResource("da_UTF8.xml").toExternalForm());
+ HyphenationTree hyphenator =
+ HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+ return hyphenator;
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ });
+ put(
+ SnowballStemmer.class,
+ random -> {
+ try {
+ var clazz = snowballStemmers.get(random.nextInt(snowballStemmers.size()));
+ return clazz.getConstructor().newInstance();
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ });
+ put(
+ String.class,
+ random -> {
+ // TODO: make nastier
+ if (random.nextBoolean()) {
+ // a token type
+ return StandardTokenizer.TOKEN_TYPES[
+ random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
+ } else {
+ return TestUtil.randomSimpleString(random);
+ }
+ });
+ put(
+ NormalizeCharMap.class,
+ random -> {
+ NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+ // we can't add duplicate keys, or NormalizeCharMap gets angry
+ Set<String> keys = new HashSet<>();
+ int num = random.nextInt(5);
+ // System.out.println("NormalizeCharMap=");
+ for (int i = 0; i < num; i++) {
+ String key = TestUtil.randomSimpleString(random);
+ if (!keys.contains(key) && key.length() > 0) {
+ String value = TestUtil.randomSimpleString(random);
+ builder.add(key, value);
+ keys.add(key);
+ // System.out.println("mapping: '" + key + "' => '" + value + "'");
+ }
+ }
+ return builder.build();
+ });
+ put(
+ CharacterRunAutomaton.class,
+ random -> {
+ // TODO: could probably use a purely random automaton
+ switch (random.nextInt(5)) {
+ case 0:
+ return MockTokenizer.KEYWORD;
+ case 1:
+ return MockTokenizer.SIMPLE;
+ case 2:
+ return MockTokenizer.WHITESPACE;
+ case 3:
+ return MockTokenFilter.EMPTY_STOPSET;
+ default:
+ return MockTokenFilter.ENGLISH_STOPSET;
+ }
+ });
+ put(
+ CharArrayMap.class,
+ random -> {
+ int num = random.nextInt(10);
+ CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean());
+ for (int i = 0; i < num; i++) {
+ // TODO: make nastier
+ map.put(
+ TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random));
+ }
+ return map;
+ });
+ put(
+ StemmerOverrideMap.class,
+ random -> {
+ int num = random.nextInt(10);
+ StemmerOverrideFilter.Builder builder =
+ new StemmerOverrideFilter.Builder(random.nextBoolean());
+ for (int i = 0; i < num; i++) {
+ String input = "";
+ do {
+ input = TestUtil.randomRealisticUnicodeString(random);
+ } while (input.isEmpty());
+ String out = "";
+ TestUtil.randomSimpleString(random);
+ do {
+ out = TestUtil.randomRealisticUnicodeString(random);
+ } while (out.isEmpty());
+ builder.add(input, out);
+ }
+ try {
+ return builder.build();
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ });
+ put(
+ SynonymMap.class,
+ new Function<Random, Object>() {
+ @Override
+ public Object apply(Random random) {
+ SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
+ final int numEntries = atLeast(10);
+ for (int j = 0; j < numEntries; j++) {
+ addSyn(
+ b,
+ randomNonEmptyString(random),
+ randomNonEmptyString(random),
+ random.nextBoolean());
+ }
+ try {
+ return b.build();
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ }
+
+ private void addSyn(
+ SynonymMap.Builder b, String input, String output, boolean keepOrig) {
+ b.add(
+ new CharsRef(input.replaceAll(" +", "\u0000")),
+ new CharsRef(output.replaceAll(" +", "\u0000")),
+ keepOrig);
+ }
+
+ private String randomNonEmptyString(Random random) {
+ while (true) {
+ final String s = TestUtil.randomUnicodeString(random).trim();
+ if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+ return s;
+ }
+ }
+ }
+ });
+ put(
+ DateFormat.class,
+ random -> {
+ if (random.nextBoolean()) return null;
+ return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
+ });
+ put(
+ Automaton.class,
+ random -> {
+ return Operations.determinize(
+ new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE)
+ .toAutomaton(),
+ Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+ });
+ put(
+ PatternTypingFilter.PatternTypingRule[].class,
+ random -> {
+ int numRules = TestUtil.nextInt(random, 1, 3);
+ PatternTypingFilter.PatternTypingRule[] patternTypingRules =
+ new PatternTypingFilter.PatternTypingRule[numRules];
+ for (int i = 0; i < patternTypingRules.length; i++) {
+ String s = TestUtil.randomSimpleString(random, 1, 2);
+ // random regex with one group
+ String regex = s + "(.*)";
+ // pattern rule with a template that accepts one group.
+ patternTypingRules[i] =
+ new PatternTypingFilter.PatternTypingRule(
+ Pattern.compile(regex), TestUtil.nextInt(random, 1, 8), s + "_$1");
+ }
+ return patternTypingRules;
+ });
+
+ // ICU:
+ put(
+ Normalizer2.class,
+ random -> {
+ switch (random.nextInt(5)) {
+ case 0:
+ return Normalizer2.getNFCInstance();
+ case 1:
+ return Normalizer2.getNFDInstance();
+ case 2:
+ return Normalizer2.getNFKCInstance();
+ case 3:
+ return Normalizer2.getNFKDInstance();
+ default:
+ return Normalizer2.getNFKCCasefoldInstance();
+ }
+ });
+ final var icuTransliterators = Collections.list(Transliterator.getAvailableIDs());
+ Collections.sort(icuTransliterators);
+ put(
+ Transliterator.class,
+ random ->
+ Transliterator.getInstance(
+ icuTransliterators.get(random.nextInt(icuTransliterators.size()))));
+ put(
+ ICUTokenizerConfig.class,
+ random ->
+ new DefaultICUTokenizerConfig(random.nextBoolean(), random.nextBoolean()));
+
+ // Kuromoji:
+ final var jaComplFilterModes = JapaneseCompletionFilter.Mode.values();
+ put(
+ JapaneseCompletionFilter.Mode.class,
+ random -> jaComplFilterModes[random.nextInt(jaComplFilterModes.length)]);
+ final var jaTokModes = JapaneseTokenizer.Mode.values();
+ put(
+ JapaneseTokenizer.Mode.class,
+ random -> jaTokModes[random.nextInt(jaTokModes.length)]);
+ put(org.apache.lucene.analysis.ja.dict.UserDictionary.class, random -> null);
+
+ // Nori:
+ final var koComplFilterModes = KoreanTokenizer.DecompoundMode.values();
+ put(
+ KoreanTokenizer.DecompoundMode.class,
+ random -> koComplFilterModes[random.nextInt(koComplFilterModes.length)]);
+ put(org.apache.lucene.analysis.ko.dict.UserDictionary.class, random -> null);
+
+ // Phonetic:
+ final var bmNameTypes = org.apache.commons.codec.language.bm.NameType.values();
+ final var bmRuleTypes =
+ Stream.of(org.apache.commons.codec.language.bm.RuleType.values())
+ .filter(e -> e != org.apache.commons.codec.language.bm.RuleType.RULES)
+ .toArray(org.apache.commons.codec.language.bm.RuleType[]::new);
+ put(
+ PhoneticEngine.class,
+ random ->
+ new PhoneticEngine(
+ bmNameTypes[random.nextInt(bmNameTypes.length)],
+ bmRuleTypes[random.nextInt(bmRuleTypes.length)],
+ random.nextBoolean()));
+ put(
+ Encoder.class,
+ random -> {
+ switch (random.nextInt(7)) {
+ case 0:
+ return new DoubleMetaphone();
+ case 1:
+ return new Metaphone();
+ case 2:
+ return new Soundex();
+ case 3:
+ return new RefinedSoundex();
+ case 4:
+ return new Caverphone2();
+ case 5:
+ return new ColognePhonetic();
+ default:
+ return new Nysiis();
+ }
+ });
+
+ // Stempel
+ put(
+ StempelStemmer.class,
+ random -> new StempelStemmer(PolishAnalyzer.getDefaultTable()));
+ }
+ });
+
+ static final Set<Class<?>> allowedTokenizerArgs = argProducers.keySet(),
+ allowedTokenFilterArgs =
+ union(argProducers.keySet(), List.of(TokenStream.class, CommonGramsFilter.class)),
+ allowedCharFilterArgs = union(argProducers.keySet(), List.of(Reader.class));
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ List<Class<?>> analysisClasses =
+ ModuleClassDiscovery.getClassesForPackage("org.apache.lucene.analysis");
+ tokenizers = new ArrayList<>();
+ tokenfilters = new ArrayList<>();
+ charfilters = new ArrayList<>();
+ for (final Class<?> c : analysisClasses) {
+ final int modifiers = c.getModifiers();
+ if (
+ // don't waste time with abstract classes, deprecated, or @IgnoreRandomChains annotated
+ // classes:
+ Modifier.isAbstract(modifiers)
+ || !Modifier.isPublic(modifiers)
+ || c.isSynthetic()
+ || c.isAnonymousClass()
+ || c.isMemberClass()
+ || c.isInterface()
+ || c.isAnnotationPresent(Deprecated.class)
+ || c.isAnnotationPresent(IgnoreRandomChains.class)
+ || !(Tokenizer.class.isAssignableFrom(c)
+ || TokenFilter.class.isAssignableFrom(c)
+ || CharFilter.class.isAssignableFrom(c))) {
+ continue;
+ }
+
+ for (final Constructor<?> ctor : c.getConstructors()) {
+ // don't test synthetic, deprecated, or @IgnoreRandomChains annotated ctors, they likely
+ // have known bugs:
+ if (ctor.isSynthetic()
+ || ctor.isAnnotationPresent(Deprecated.class)
+ || ctor.isAnnotationPresent(IgnoreRandomChains.class)) {
+ continue;
+ }
+ // conditional filters are tested elsewhere
+ if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
+ continue;
+ }
+ if (Tokenizer.class.isAssignableFrom(c)) {
+ assertTrue(
+ ctor.toGenericString() + " has unsupported parameter types",
+ allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+ tokenizers.add(castConstructor(Tokenizer.class, ctor));
+ } else if (TokenFilter.class.isAssignableFrom(c)) {
+ assertTrue(
+ ctor.toGenericString() + " has unsupported parameter types",
+ allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+ tokenfilters.add(castConstructor(TokenFilter.class, ctor));
+ } else if (CharFilter.class.isAssignableFrom(c)) {
+ assertTrue(
+ ctor.toGenericString() + " has unsupported parameter types",
+ allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+ charfilters.add(castConstructor(CharFilter.class, ctor));
+ } else {
+ fail("Cannot get here");
+ }
+ }
+ }
+
+ final Comparator<Constructor<?>> ctorComp = Comparator.comparing(Constructor::toGenericString);
+ Collections.sort(tokenizers, ctorComp);
+ Collections.sort(tokenfilters, ctorComp);
+ Collections.sort(charfilters, ctorComp);
+ if (VERBOSE) {
+ System.out.println("tokenizers = " + tokenizers);
+ System.out.println("tokenfilters = " + tokenfilters);
+ System.out.println("charfilters = " + charfilters);
+ }
+
+ // TODO: Eclipse does not get that cast right, so make explicit:
+ final Function<Class<?>, Class<? extends SnowballStemmer>> stemmerCast =
+ c -> c.asSubclass(SnowballStemmer.class);
+ snowballStemmers =
+ ModuleClassDiscovery.getClassesForPackage("org.tartarus.snowball.ext").stream()
+ .filter(c -> c.getName().endsWith("Stemmer"))
+ .map(stemmerCast)
+ .sorted(Comparator.comparing(Class::getName))
+ .collect(Collectors.toList());
+ if (VERBOSE) {
+ System.out.println("snowballStemmers = " + snowballStemmers);
+ }
+ }
+
+ @AfterClass
+ public static void afterClass() {
+ tokenizers = null;
+ tokenfilters = null;
+ charfilters = null;
+ snowballStemmers = null;
+ }
+
+ /** Creates a static/unmodifiable set from 2 collections as union. */
+ private static <T> Set<T> union(Collection<T> c1, Collection<T> c2) {
+ return Stream.concat(c1.stream(), c2.stream()).collect(Collectors.toUnmodifiableSet());
+ }
+
+ /**
+ * Hack to work around Java's strict backwards-compatibility constraints. {@code
+ * Class<T>#getConstructors()} should return an unmodifiable {@code List<Constructor<T>>}, not an array!
+ */
+ @SuppressWarnings("unchecked")
+ private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
+ return (Constructor<T>) ctor;
+ }
+
+ @SuppressWarnings("unchecked")
+ static <T> T newRandomArg(Random random, Class<T> paramType) {
+ final Function<Random, Object> producer = argProducers.get(paramType);
+ assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
+ return (T) producer.apply(random);
+ }
+
+ static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
+ Object[] args = new Object[paramTypes.length];
+ for (int i = 0; i < args.length; i++) {
+ Class<?> paramType = paramTypes[i];
+ args[i] = newRandomArg(random, paramType);
+ }
+ return args;
+ }
+
+ static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+ Object[] args = new Object[paramTypes.length];
+ for (int i = 0; i < args.length; i++) {
+ Class<?> paramType = paramTypes[i];
+ if (paramType == Reader.class) {
+ args[i] = reader;
+ } else {
+ args[i] = newRandomArg(random, paramType);
+ }
+ }
+ return args;
+ }
+
+ static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
+ Object[] args = new Object[paramTypes.length];
+ for (int i = 0; i < args.length; i++) {
+ Class<?> paramType = paramTypes[i];
+ if (paramType == TokenStream.class) {
+ args[i] = stream;
+ } else if (paramType == CommonGramsFilter.class) {
+ // TODO: fix this one, that's broken: CommonGramsQueryFilter takes this one explicitly
+ args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class));
+ } else {
+ args[i] = newRandomArg(random, paramType);
+ }
+ }
+ return args;
+ }
+
+ static class MockRandomAnalyzer extends Analyzer {
+ final long seed;
+
+ MockRandomAnalyzer(long seed) {
+ this.seed = seed;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Random random = new Random(seed);
+ TokenizerSpec tokenizerSpec = newTokenizer(random);
+ // System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
+ TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
+ // System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
+ return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
+ }
+
+ @Override
+ protected Reader initReader(String fieldName, Reader reader) {
+ Random random = new Random(seed);
+ CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
+ return charfilterspec.reader;
+ }
+
+ @Override
+ public String toString() {
+ Random random = new Random(seed);
+ StringBuilder sb = new StringBuilder();
+ CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
+ sb.append("\ncharfilters=");
+ sb.append(charFilterSpec.toString);
+ // intentional: initReader gets its own separate random
+ random = new Random(seed);
+ TokenizerSpec tokenizerSpec = newTokenizer(random);
+ sb.append("\n");
+ sb.append("tokenizer=");
+ sb.append(tokenizerSpec.toString);
+ TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
+ sb.append("\n");
+ sb.append("filters=");
+ sb.append(tokenFilterSpec.toString);
+ return sb.toString();
+ }
+
+ private <T> T createComponent(
+ Constructor<T> ctor, Object[] args, StringBuilder descr, boolean isConditional) {
+ try {
+ final T instance = ctor.newInstance(args);
+ /*
+ if (descr.length() > 0) {
+ descr.append(",");
+ }
+ */
+ descr.append("\n ");
+ if (isConditional) {
+ descr.append("Conditional:");
+ }
+ descr.append(ctor.getDeclaringClass().getName());
+ String params = Arrays.deepToString(args);
+ params = params.substring(1, params.length() - 1);
+ descr.append("(").append(params).append(")");
+ return instance;
+ } catch (InvocationTargetException ite) {
+ final Throwable cause = ite.getCause();
+ if (cause instanceof IllegalArgumentException
+ || cause instanceof UnsupportedOperationException) {
+ // that's OK, ignore
+ if (VERBOSE) {
+ System.err.println("Ignoring IAE/UOE from ctor:");
+ cause.printStackTrace(System.err);
+ }
+ } else {
+ Rethrow.rethrow(cause);
+ }
+ } catch (IllegalAccessException | InstantiationException iae) {
+ Rethrow.rethrow(iae);
+ }
+ return null; // no success
+ }
+
+ private boolean broken(Constructor<?> ctor, Object[] args) {
+ final Predicate<Object[]> pred = brokenConstructors.get(ctor);
+ return pred != null && pred.test(args);
+ }
+
+ // create a new random tokenizer from classpath
+ private TokenizerSpec newTokenizer(Random random) {
+ TokenizerSpec spec = new TokenizerSpec();
+ while (spec.tokenizer == null) {
+ final Constructor<? extends Tokenizer> ctor =
+ tokenizers.get(random.nextInt(tokenizers.size()));
+ final StringBuilder descr = new StringBuilder();
+ final Object[] args = newTokenizerArgs(random, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ continue;
+ }
+ spec.tokenizer = createComponent(ctor, args, descr, false);
+ if (spec.tokenizer != null) {
+ spec.toString = descr.toString();
+ }
+ }
+ return spec;
+ }
+
+ private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
+ CharFilterSpec spec = new CharFilterSpec();
+ spec.reader = reader;
+ StringBuilder descr = new StringBuilder();
+ int numFilters = random.nextInt(3);
+ for (int i = 0; i < numFilters; i++) {
+ while (true) {
+ final Constructor<? extends CharFilter> ctor =
+ charfilters.get(random.nextInt(charfilters.size()));
+ final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ continue;
+ }
+ reader = createComponent(ctor, args, descr, false);
+ if (reader != null) {
+ spec.reader = reader;
+ break;
+ }
+ }
+ }
+ spec.toString = descr.toString();
+ return spec;
+ }
+
+ private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
+ TokenFilterSpec spec = new TokenFilterSpec();
+ spec.stream = tokenizer;
+ StringBuilder descr = new StringBuilder();
+ int numFilters = random.nextInt(5);
+ for (int i = 0; i < numFilters; i++) {
+
+ // Insert ValidatingTF after each stage so we can
+ // catch problems right after the TF that "caused"
+ // them:
+ spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
+ while (true) {
+ final Constructor<? extends TokenFilter> ctor =
+ tokenfilters.get(random.nextInt(tokenfilters.size()));
+ if (random.nextBoolean()
+ && avoidConditionals.contains(ctor.getDeclaringClass()) == false) {
+ long seed = random.nextLong();
+ spec.stream =
+ new ConditionalTokenFilter(
+ spec.stream,
+ in -> {
+ final Object[] args = newFilterArgs(random, in, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ return in;
+ }
+ TokenStream ts = createComponent(ctor, args, descr, true);
+ if (ts == null) {
+ return in;
+ }
+ return ts;
+ }) {
+ Random random = new Random(seed);
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ random = new Random(seed);
+ }
+
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return random.nextBoolean();
+ }
+ };
+ break;
+ } else {
+ final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ continue;
+ }
+ final TokenFilter flt = createComponent(ctor, args, descr, false);
+ if (flt != null) {
+ spec.stream = flt;
+ break;
+ }
+ }
+ }
+ }
+
+ // Insert ValidatingTF after each stage so we can
+ // catch problems right after the TF that "caused"
+ // them:
+ spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
+ spec.toString = descr.toString();
+ return spec;
+ }
+ }
+
+ static class TokenizerSpec {
+ Tokenizer tokenizer;
+ String toString;
+ }
+
+ static class TokenFilterSpec {
+ TokenStream stream;
+ String toString;
+ }
+
+ static class CharFilterSpec {
+ Reader reader;
+ String toString;
+ }
+
+ public void testRandomChains() throws Throwable {
+ int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
+ Random random = random();
+ for (int i = 0; i < numIterations; i++) {
+ try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
+ if (VERBOSE) {
+ System.out.println("Creating random analyzer:" + a);
+ }
+ try {
+ checkNormalize(a);
+ checkRandomData(
+ random,
+ a,
+ 500 * RANDOM_MULTIPLIER,
+ 20,
+ false,
+ false /* We already validate our own offsets... */);
+ } catch (Throwable e) {
+ System.err.println("Exception from random analyzer: " + a);
+ throw e;
+ }
+ }
+ }
+ }
+
+ public void checkNormalize(Analyzer a) {
+ // normalization should not modify characters that may be used for wildcards
+ // or regular expressions
+ String s = "([0-9]+)?*";
+ assertEquals(s, a.normalize("dummy", s).utf8ToString());
+ }
+
+ // we might regret this decision...
+ public void testRandomChainsWithLargeStrings() throws Throwable {
+ int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
+ Random random = random();
+ for (int i = 0; i < numIterations; i++) {
+ try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
+ if (VERBOSE) {
+ System.out.println("Creating random analyzer:" + a);
+ }
+ try {
+ checkRandomData(
+ random,
+ a,
+ 50 * RANDOM_MULTIPLIER,
+ 80,
+ false,
+ false /* We already validate our own offsets... */);
+ } catch (Throwable e) {
+ System.err.println("Exception from random analyzer: " + a);
+ throw e;
+ }
+ }
+ }
+ }
+}
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/da_UTF8.xml b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/da_UTF8.xml
new file mode 100644
index 0000000..2c8d203
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/da_UTF8.xml
@@ -0,0 +1,1208 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+<!--
+ Copyright 1999-2004 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+This file contains the hyphenation patterns for danish.
+Adapted from dkhyph.tex, dkcommon.tex and dkspecial.tex
+originally created by Frank Jensen (fj@iesd.auc.dk).
+FOP adaptation by Carlos Villegas (cav@uniscope.co.jp)
+-->
+<hyphenation-info>
+
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+
+<classes>
+aA
+bB
+cC
+dD
+eE
+fF
+gG
+hH
+iI
+jJ
+kK
+lL
+mM
+nN
+oO
+pP
+qQ
+rR
+sS
+tT
+uU
+vV
+wW
+xX
+yY
+zZ
+ÊÃ
+ÞÃ
+Ã¥Ã
+</classes>
+<patterns>
+<!-- dkcommon -->
+.ae3
+.an3k
+.an1s
+.be5la
+.be1t
+.bi4tr
+.der3i
+.diagno5
+.her3
+.hoved3
+.ne4t5
+.om1
+.ove4
+.po1
+.til3
+.yd5r
+ab5le
+3abst
+a3c
+ade5la
+5adg
+a1e
+5afg
+5a4f1l
+af3r
+af4ri
+5afs
+a4gef
+a4gi
+ag5in
+ag5si
+3agti
+a4gy
+a3h
+ais5t
+a3j
+a5ka
+a3ke
+a5kr
+aku5
+a3la
+a1le
+a1li
+al3k
+4alkv
+a1lo
+al5si
+a3lu
+a1ly
+am4pa
+3analy
+an4k5r
+a3nu
+3anv
+a5o
+a5pe
+a3pi
+a5po
+a1ra
+ar5af
+1arb
+a1re
+5arg
+a1ri
+a3ro
+a3sa
+a3sc
+a1si
+a3sk
+a3so
+3a3sp
+a3ste
+a3sti
+a1ta1
+a1te
+a1ti
+a4t5in
+a1to
+ato5v
+a5tr
+a1tu
+a5va
+a1ve
+a5z
+1ba
+ba4ti
+4bd
+1be
+be1k
+be3ro
+be5ru
+be1s4
+be1tr
+1bi
+bi5sk
+b1j
+4b1n
+1bo
+bo4gr
+bo3ra
+bo5re
+1br4
+4bs
+bs5k
+b3so
+b1st
+b5t
+3bu
+bu4s5tr
+b5w
+1by
+by5s
+4c1c
+1ce
+ce5ro
+3ch
+4ch.
+ci4o
+ck3
+5cy
+3da
+4d3af
+d5anta
+da4s
+d1b
+d1d4
+1de
+de5d
+4de4lem
+der5eri
+de4rig
+de5sk
+d1f
+d1g
+d3h
+1di
+di1e
+di5l
+d3j
+d1k
+d1l
+d1m
+4d1n
+3do
+4dop
+d5ov
+d1p
+4drett
+5d4reve
+3drif
+3driv
+d5ros
+d5ru
+ds5an
+ds5in
+d1ski
+d4sm
+d4su
+dsu5l
+ds5vi
+d3ta
+d1te
+dt5o
+d5tr
+dt5u
+1du
+dub5
+d1v
+3dy
+e5ad
+e3af
+e5ag
+e3ak
+e1al
+ea4la
+e3an
+e5ap
+e3at
+e3bl
+ebs3
+e1ci
+ed5ar
+edde4
+eddel5
+e4do
+ed5ra
+ed3re
+ed3rin
+ed4str
+e3e
+3eff
+e3fr
+3eft
+e3gu
+e1h
+e3in
+ei5s
+e3je
+e4j5el
+e1ka
+e3ke
+e3kl
+4e1ko
+e5kr
+ek5sa
+3eksem
+3eksp
+e3ku
+e1kv
+e5ky
+e3lad
+el3ak
+el3ar
+e1las
+e3le
+e4lek
+3elem
+e1li
+5elim
+e3lo
+el5sa
+e5lu
+e3ly
+e4mad
+em4p5le
+em1s
+en5ak
+e4nan
+4enn
+e4no
+en3so
+e5nu
+e5ol
+e3op
+e1or
+e3ov
+epi3
+e1pr
+e3ra
+er3af
+e4rag
+e4rak
+e1re
+e4ref
+er5ege
+5erhv
+e1ri
+e4rib
+er1k
+ero5d
+er5ov
+er3s
+er5tr
+e3rum
+er5un
+e5ry
+e1ta
+e1te
+etek4s
+e1ti
+e3tj
+e1to
+e3tr
+e3tu
+e1ty
+e3um
+e3un
+3eur
+e1va
+e3ve
+e4v3erf
+e1vi
+e5x
+1fa
+fa4ce
+fags3
+f1b
+f1d
+1fe
+fej4
+fejl1
+f1f
+f1g
+f1h
+1fi
+f1k
+3fl
+1fo
+for1en
+fo4ri
+f1p
+f1s4
+4ft
+f3ta
+f1te
+f1ti
+f5to
+f5tvi
+1fu
+f1v
+3fy
+1ga
+g3art
+g1b
+g1d
+1ge
+4g5enden
+ger3in
+ge3s
+g3f
+g1g
+g1h
+1gi
+gi4b
+gi3st
+5gj
+g3k
+g1l
+g1m
+3go
+4g5om
+g5ov
+g3p
+1gr
+gs1a
+gsde4len
+g4se
+gsha4
+g5sla
+gs3or
+gs1p
+g5s4tide
+g4str
+gs1v
+g3ta
+g1te
+g1ti
+g5to
+g3tr
+gt4s
+g3ud
+gun5
+g3v
+1gy
+g5yd
+4ha.
+heds3
+he5s
+4het
+hi4e
+hi4n5
+hi3s
+ho5ko
+ho5ve
+4h3t
+hun4
+hund3
+hvo4
+i1a
+i3b
+i4ble
+i1c
+i3dr
+ids5k
+i1el
+i1en
+i3er
+i3et.
+if3r
+i3gu
+i3h
+i5i
+i5j
+i1ka
+i1ke
+ik1l
+i5ko
+ik3re
+ik5ri
+iks5t
+ik4tu
+i3ku
+ik3v
+i3lag
+il3eg
+il5ej
+il5el
+i3li
+i4l5id
+il3k
+i1lo
+il5u
+i3mu
+ind3t
+5inf
+ings1
+in3s
+in4sv
+inter1
+i3nu
+i3od
+i3og
+i5ok
+i3ol
+ion4
+ions1
+i5o5r
+i3ot
+i5pi
+i3pli
+i5pr
+i3re
+i3ri
+ir5t
+i3sc
+i3si
+i4sm
+is3p
+i1ster
+i3sti
+i5sua
+i1ta
+i1te
+i1ti
+i3to
+i3tr
+it5re.
+i1tu
+i3ty
+i1u
+i1va
+i1ve
+i1vi
+j3ag
+jde4rer
+jds1
+jek4to
+4j5en.
+j5k
+j3le
+j3li
+jlmeld5
+jlmel4di
+j3r
+jre5
+ju3s
+5kap
+k5au
+5kav
+k5b
+kel5s
+ke3sk
+ke5st
+ke4t5a
+k3h
+ki3e
+ki3st
+k1k
+k5lak
+k1le
+3klu
+k4ny
+5kod
+1kon
+ko3ra
+3kort
+ko3v
+1kra
+5kry
+ks3an
+k1si
+ks3k
+ks1p
+k3ste
+k5stu
+ks5v
+k1t
+k4tar
+k4terh
+kti4e
+kt5re
+kt5s
+3kur
+1kus
+3kut
+k4vo
+k4vu
+5lab
+lad3r
+5lagd
+la4g3r
+5lam
+1lat
+l1b
+ldiagnos5
+l3dr
+ld3st
+1le.
+5led
+4lele
+le4mo
+3len
+1ler
+1les
+4leu
+l1f
+lfin4
+lfind5
+l1go1
+l3h
+li4ga
+4l5ins
+4l3int
+li5o
+l3j
+l1ke
+l1ko
+l3ky
+l1l
+l5mu
+lo4du
+l3op
+4l5or
+3lov
+4l3p
+l4ps
+l3r
+4ls
+lses1
+ls5in
+l5sj
+l1ta
+l4taf
+l1te
+l4t5erf
+l3ti
+lt3o
+l3tr
+l3tu
+lu5l
+l3ve
+l3vi
+1ma
+m1b
+m3d
+1me
+4m5ej
+m3f
+m1g
+m3h
+1mi
+mi3k
+m5ing
+mi4o
+mi5sty
+m3k
+m1l
+m1m
+mmen5
+m1n
+3mo
+mo4da
+4mop
+4m5ov
+m1pe
+m3pi
+m3pl
+m1po
+m3pr
+m1r
+mse5s
+ms5in
+m5sk
+ms3p
+m3ste
+ms5v
+m3ta
+m3te
+m3ti
+m3tr
+m1ud
+1mul
+mu1li
+3my
+3na
+4nak
+1nal
+n1b
+n1c
+4nd
+n3dr
+nd5si
+nd5sk
+nd5sp
+1ne
+ne5a
+ne4da
+nemen4
+nement5e
+neo4
+n3erk
+n5erl
+ne5sl
+ne5st
+n1f
+n4go
+4n1h
+1ni
+4nim
+ni5o
+ni3st
+n1ke
+n1ko
+n3kr
+n3ku
+n5kv
+4n1l
+n1m
+n1n
+1no
+n3ord
+n5p
+n3r
+4ns
+n3si
+n1sku
+ns3po
+n1sta
+n5sti
+n1ta
+nta4le
+n1te
+n1ti
+ntiali4
+n3to
+n1tr
+nt4s5t
+nt4su
+n3tu
+n3ty
+4n1v
+3ny
+n3z
+o3a
+o4as
+ob3li
+o1c
+o4din
+od5ri
+od5s
+od5un
+o1e
+of5r
+o4gek
+o4gel
+o4g5o
+og5re
+og5sk
+o5h
+o5in
+oi6s5e
+o1j
+o3ka
+o1ke
+o3ku
+o3la
+o3le
+o1li
+o1lo
+o3lu
+o5ly
+1omr
+on3k
+ook5
+o3or
+o5ov
+o3pi
+op3l
+op3r
+op3s
+3opta
+4or.
+or1an
+3ordn
+ord5s
+o3re.
+o3reg
+o3rek
+o3rer
+o3re3s
+o3ret
+o3ri
+3orient
+or5im
+o4r5in
+or3k
+or5o
+or3sl
+or3st
+o3si
+o3so
+o3t
+o1te
+o5un
+ov4s
+3pa
+pa5gh
+p5anl
+p3d
+4pec
+3pen
+1per
+pe1ra
+pe5s
+pe3u
+p3f
+4p5h
+1pla
+p4lan
+4ple.
+4pler
+4ples
+p3m
+p3n
+5pok
+4po3re
+3pot
+4p5p4
+p4ro
+1proc
+p3sk
+p5so
+ps4p
+p3st
+p1t
+1pu
+pu5b
+p5ule
+p5v
+5py3
+qu4
+4raf
+ra5is
+4rarb
+r1b
+r4d5ar
+r3dr
+rd4s3
+4reks
+1rel
+re5la
+r5enss
+5rese
+re5spo
+4ress
+re3st
+re5s4u
+5rett
+r1f
+r1gu
+r1h
+ri1e
+ri5la
+4rimo
+r4ing
+ringse4
+ringso4r
+4rinp
+4rint
+r3ka
+r1ke
+r1ki
+rk3so
+r3ku
+r1l
+rmo4
+r5mu
+r1n
+ro1b
+ro3p
+r3or
+r3p
+r1r
+rre5s
+rro4n5
+r1sa
+r1si
+r5skr
+r4sk5v
+rs4n
+r3sp
+r5stu
+r5su
+r3sv
+r5tal
+r1te
+r4teli
+r1ti
+r3to
+r4t5or
+rt5rat
+rt3re
+r5tri
+r5tro
+rt3s
+r5ty
+r3ud
+run4da
+5rut
+r3va
+r1ve
+r3vi
+ry4s
+s3af
+1sam
+sa4ma
+s3ap
+s1ar
+1sat
+4s1b
+s1d
+sdy4
+1se
+s4ed
+5s4er
+se4se
+s1f
+4s1g4
+4s3h
+si4bl
+1sig
+s5int
+5sis
+5sit
+5siu
+s5ju
+4sk.
+1skab
+1ske
+s3kl
+sk5s4
+5sky
+s1le
+s1li
+slo3
+5slu
+s5ly
+s1m
+s4my
+4snin
+s4nit
+so5k
+5sol
+5som.
+3somm
+s5oms
+5somt
+3son
+4s1op
+sp4
+3spec
+4sper
+3s4pi
+s1pl
+3sprog.
+s5r4
+s1s4
+4st.
+5s4tam
+1stan
+st5as
+3stat
+1stav
+1ste.
+1sted
+3stel
+5stemo
+1sten
+5step
+3ster.
+3stes
+5stet
+5stj
+3sto
+st5om
+1str
+s1ud
+3sul
+s3un
+3sur
+s3ve
+3s4y
+1sy1s
+5ta.
+1tag
+tands3
+4tanv
+4tb
+tede4l
+teds5
+3teg
+5tekn
+teo1
+5term
+te5ro
+4t1f
+6t3g
+t1h
+tialis5t
+3tid
+ti4en
+ti3st
+4t3k
+4t1l
+tli4s5
+t1m
+t1n
+to5ra
+to1re
+to1ri
+tor4m
+4t3p
+t4ra
+4tres
+tro5v
+1try
+4ts
+t3si
+ts4pa
+ts5pr
+t3st
+ts5ul
+4t1t
+t5uds
+5tur
+t5ve
+1typ
+u1a
+5udl
+ud5r
+ud3s
+3udv
+u1e
+ue4t5
+uge4ri
+ugs3
+u5gu
+u3i
+u5kl
+uk4ta
+uk4tr
+u1la
+u1le
+u5ly
+u5pe
+up5l
+u5q
+u3ra
+u3re
+u4r3eg
+u1rer
+u3ro
+us5a
+u3si
+u5ska
+u5so
+us5v
+u1te
+u1ti
+u1to
+ut5r
+ut5s4
+5u5v
+va5d
+3varm
+1ved
+ve4l5e
+ve4reg
+ve3s
+5vet
+v5h
+vi4l3in
+1vis
+v5j
+v5k
+vl4
+v3le
+v5li
+vls1
+1vo
+4v5om
+v5p
+v5re
+v3st
+v5su
+v5t
+3vu
+y3a
+y5dr
+y3e
+y3ke
+y5ki
+yk3li
+y3ko
+yk4s5
+y3kv
+y5li
+y5lo
+y5mu
+yns5
+y5o
+y1pe
+y3pi
+y3re
+yr3ek
+y3ri
+y3si
+y3ti
+y5t3r
+y5ve
+zi5o
+<!-- dkspecial -->
+.så3
+.Êr5i
+.Þv3r
+a3tÞ
+a5vÊ
+brÞd3
+5bÊ
+5drÞv
+dstå4
+3dÊ
+3dÞ
+e3lÊ
+e3lÞ
+e3rÞ
+er5Þn
+e5tÊ
+e5tÞ
+e1vÊ
+e3Ê
+e5Ã¥
+3fÊ
+3fÞ
+fÞ4r5en
+giÞ4
+g4sÞ
+g5så
+3gÊ
+3gÞ1
+3gå
+i5tÊ
+i3Þ
+3kÞ
+3kå
+lingeniÞ4
+l3vÊ
+5lÞs
+m5tå
+1mÊ
+3mÞ
+3må
+n3kÊ
+n5tÊ
+3nÊ
+4n5Êb
+5nÞ
+o5lÊ
+or3Þ
+o5Ã¥
+5prÊ
+5pÊd
+på3
+r5kÊ
+r5tÊ
+r5tÞ
+r3vÊ
+r5Êl
+4rÞn
+5rÞr
+3råd
+r5Ã¥r
+s4kå
+3slå
+s4nÊ
+5stÞ
+1stå
+1sÊ
+4s5Ên
+1sÞ
+s5Þk
+så4r5
+ti4Þ
+3trÊk.
+t4sÞ
+t5så
+t3vÊ
+u3lÊ
+3vÊrd
+1vÊrk
+5vå
+y5vÊ
+Êb3l
+Ê3c
+Ê3e
+Êg5a
+Ê4gek
+Ê4g5r
+Êgs5
+Ê5i
+Ê5kv
+Êlle4
+Ên1dr
+Ê5o
+Ê1re
+Êr4g5r
+Ê3ri
+Êr4ma
+Êr4mo
+Êr5s
+Ê5si
+Ê3so
+Ê3ste
+Ê3ve
+Þde5
+Þ3e
+Þ1je
+Þ3ke
+Þ3le
+Þms5
+Þn3st
+Þn4t3
+Þ1re
+Þ3ri
+Þrne3
+Þr5o
+Þ1ve
+Ã¥1d
+Ã¥1e
+Ã¥5h
+Ã¥3l
+Ã¥3re
+Ã¥rs5t
+Ã¥5sk
+Ã¥3t
+</patterns>
+</hyphenation-info>
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.aff b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.aff
new file mode 100644
index 0000000..aaf4a6c
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.aff
@@ -0,0 +1,20 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 3
+SFX A 0 e n
+SFX A 0 e t
+SFX A 0 e h
+
+SFX C Y 2
+SFX C 0 d/C c
+SFX C 0 c b
+
+SFX D Y 1
+SFX D 0 s o
+
+SFX E Y 1
+SFX E 0 d o
+
+PFX B Y 1
+PFX B 0 s o
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.dic b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.dic
new file mode 100644
index 0000000..2809611
--- /dev/null
+++ b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/simple.dic
@@ -0,0 +1,11 @@
+9
+ab/C
+apach/A
+foo/D
+foo/E
+lucen/A
+lucene
+mahout/A
+moo/E
+olr/B
+db
\ No newline at end of file
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
index b70768e..9e693ca 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Characters before the delimiter are the "token", those after are the boost.
@@ -30,6 +31,8 @@ import org.apache.lucene.search.BoostAttribute;
*
* <p>Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
*/
+@IgnoreRandomChains(
+ reason = "requires a special encoded token value, so it may fail with random data")
public final class DelimitedBoostTokenFilter extends TokenFilter {
private final char delimiter;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
index 2f9337d..a384dba 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Forms bigrams of CJK terms that are generated from StandardTokenizer or ICUTokenizer.
@@ -47,6 +48,7 @@ import org.apache.lucene.util.ArrayUtil;
*
* <p>In all cases, all non-CJK input is passed thru unmodified.
*/
+@IgnoreRandomChains(reason = "LUCENE-8092: doesn't handle graph inputs")
public final class CJKBigramFilter extends TokenFilter {
// configuration
/** bigram flag for Han Ideographs */
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
index d1a81c1..0979ade 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/*
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
@@ -43,10 +44,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
* "the-quick" has a term.type() of "gram"
* </ul>
*/
-
-/*
- * Constructors and makeCommonSet based on similar code in StopFilter
- */
+@IgnoreRandomChains(reason = "LUCENE-4983")
public final class CommonGramsFilter extends TokenFilter {
public static final String GRAM_TYPE = "gram";
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
index 80a6381..7a5ba13 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single words when they are
@@ -42,6 +43,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
* See:http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and
* http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798
*/
+@IgnoreRandomChains(reason = "TODO: doesn't handle graph inputs")
public final class CommonGramsQueryFilter extends TokenFilter {
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
index 144fe06..fafdec7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Normalizes token text to lower case.
@@ -27,6 +28,7 @@ import org.apache.lucene.analysis.TokenStream;
* @see org.apache.lucene.analysis.LowerCaseFilter
* @see LowerCaseFilterFactory
*/
+@IgnoreRandomChains(reason = "clones of core's filters")
public final class LowerCaseFilter extends org.apache.lucene.analysis.LowerCaseFilter {
/**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
index 22b7561..08b170f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Removes stop words from a token stream.
@@ -28,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
* @see org.apache.lucene.analysis.StopFilter
* @see StopFilterFactory
*/
+@IgnoreRandomChains(reason = "clones of core's filters")
public final class StopFilter extends org.apache.lucene.analysis.StopFilter {
/**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
index 417602c..a42c988 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Characters before the delimiter are the "token", the textual integer after is the term frequency.
@@ -36,6 +37,8 @@ import org.apache.lucene.util.ArrayUtil;
*
* <p>Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
*/
+@IgnoreRandomChains(
+ reason = "requires a special encoded token value, so it may fail with random data")
public final class DelimitedTermFrequencyTokenFilter extends TokenFilter {
public static final char DEFAULT_DELIMITER = '|';
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
index 47fef09..6821635 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* When the plain text is extracted from documents, we will often have many words hyphenated and
@@ -50,6 +51,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
* </fieldtype>
* </pre>
*/
+@IgnoreRandomChains(
+ reason = "TODO: doesn't handle graph inputs (or even look at positionIncrement)")
public final class HyphenatedWordsFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java
index 4627f9e..80863ee 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.java
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* This TokenFilter limits the number of tokens while indexing. It is a replacement for the maximum
@@ -45,6 +46,7 @@ public final class LimitTokenCountFilter extends TokenFilter {
*
* @see #LimitTokenCountFilter(TokenStream,int,boolean)
*/
+ @IgnoreRandomChains(reason = "all tokens must be consumed")
public LimitTokenCountFilter(TokenStream in, int maxTokenCount) {
this(in, maxTokenCount, false);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java
index 757fa96..0a2db1d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenOffsetFilter.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Lets all tokens pass through until it sees one with a start offset <= a configured limit,
@@ -46,6 +47,7 @@ public final class LimitTokenOffsetFilter extends TokenFilter {
*
* @param maxStartOffset the maximum start offset allowed
*/
+ @IgnoreRandomChains(reason = "all tokens must be consumed")
public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) {
this(input, maxStartOffset, false);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java
index 6230ee7..edbee58 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenPositionFilter.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* This TokenFilter limits its emitted tokens to those with positions that are not greater than the
@@ -50,6 +51,7 @@ public final class LimitTokenPositionFilter extends TokenFilter {
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
* @see #LimitTokenPositionFilter(TokenStream,int,boolean)
*/
+ @IgnoreRandomChains(reason = "all tokens must be consumed")
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) {
this(in, maxTokenPosition, false);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 2971704..8b871d3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
@@ -83,6 +84,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* StandardTokenizer} immediately removes many intra-word delimiters, it is recommended that this
* filter be used after a tokenizer that does not do this (such as {@link WhitespaceTokenizer}).
*/
+@IgnoreRandomChains(reason = "Cannot correct offsets when a char filter had changed them")
public final class WordDelimiterGraphFilter extends TokenFilter {
/**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
index 57fe65b..e7dfa32 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Tokenizer for path-like hierarchies.
@@ -40,6 +41,7 @@ import org.apache.lucene.util.AttributeFactory;
* /something/something/else
* </pre>
*/
+@IgnoreRandomChains(reason = "broken offsets")
public class PathHierarchyTokenizer extends Tokenizer {
public PathHierarchyTokenizer() {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
index d1cdb3a9..7b1f60f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Tokenizer for domain-like hierarchies.
@@ -43,6 +44,7 @@ import org.apache.lucene.util.AttributeFactory;
* uk
* </pre>
*/
+@IgnoreRandomChains(reason = "broken offsets")
public class ReversePathHierarchyTokenizer extends Tokenizer {
public ReversePathHierarchyTokenizer() {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
index 76ef11b..bdb8799 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based off of the
@@ -34,6 +35,7 @@ import org.apache.lucene.util.AttributeSource;
*
* @lucene.experimental
*/
+@IgnoreRandomChains(reason = "TODO: it seems to mess up offsets!?")
public final class WikipediaTokenizer extends Tokenizer {
public static final String INTERNAL_LINK = "il";
public static final String EXTERNAL_LINK = "el";
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
index 28777e8..0ee336a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
@@ -69,7 +69,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
protected Reader initReader(String fieldName, Reader reader) {
reader = new MockCharFilter(reader, 0);
reader = new MappingCharFilter(map, reader);
- reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
+ reader = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
return reader;
}
};
@@ -137,7 +137,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
};
public void testWrapping() throws Exception {
- CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
+ CharFilter cs = new CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
Exception expected =
expectThrows(
Exception.class,
@@ -221,6 +221,69 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
// todo: test framework?
+ static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
+ boolean readSomething;
+
+ CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
+ super(in);
+ }
+
+ @Override
+ public int correct(int currentOff) {
+ return currentOff; // we don't change any offsets
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ readSomething = true;
+ return input.read(cbuf, off, len);
+ }
+
+ @Override
+ public int read() throws IOException {
+ readSomething = true;
+ return input.read();
+ }
+
+ @Override
+ public int read(CharBuffer target) throws IOException {
+ readSomething = true;
+ return input.read(target);
+ }
+
+ @Override
+ public int read(char[] cbuf) throws IOException {
+ readSomething = true;
+ return input.read(cbuf);
+ }
+
+ @Override
+ public long skip(long n) throws IOException {
+ readSomething = true;
+ return input.skip(n);
+ }
+
+ @Override
+ public void mark(int readAheadLimit) throws IOException {
+ input.mark(readAheadLimit);
+ }
+
+ @Override
+ public boolean markSupported() {
+ return input.markSupported();
+ }
+
+ @Override
+ public boolean ready() throws IOException {
+ return input.ready();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ }
+ }
+
static final class SopTokenFilter extends TokenFilter {
SopTokenFilter(TokenStream input) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java
index 95b8bdd..5ae7b11 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java
@@ -33,10 +33,10 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.util.LuceneTestCase.Nightly;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.util.Version;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
deleted file mode 100644
index 98256b3..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ /dev/null
@@ -1,1045 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.core;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.io.StringReader;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Modifier;
-import java.net.URI;
-import java.net.URL;
-import java.nio.CharBuffer;
-import java.nio.file.DirectoryStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.text.DateFormat;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.IdentityHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.function.Function;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.CharArrayMap;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.CharFilter;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilter;
-import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
-import org.apache.lucene.analysis.cjk.CJKBigramFilter;
-import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
-import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
-import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
-import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
-import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.hunspell.Dictionary;
-import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
-import org.apache.lucene.analysis.minhash.MinHashFilter;
-import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
-import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
-import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
-import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
-import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
-import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
-import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
-import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
-import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
-import org.apache.lucene.analysis.pattern.PatternTypingFilter;
-import org.apache.lucene.analysis.payloads.IdentityEncoder;
-import org.apache.lucene.analysis.payloads.PayloadEncoder;
-import org.apache.lucene.analysis.shingle.FixedShingleFilter;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.snowball.TestSnowball;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
-import org.apache.lucene.store.ByteBuffersDirectory;
-import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.tests.analysis.CrankyTokenFilter;
-import org.apache.lucene.tests.analysis.MockTokenFilter;
-import org.apache.lucene.tests.analysis.MockTokenizer;
-import org.apache.lucene.tests.analysis.ValidatingTokenFilter;
-import org.apache.lucene.tests.util.Rethrow;
-import org.apache.lucene.tests.util.TestUtil;
-import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
-import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.Version;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.RegExp;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.tartarus.snowball.SnowballStemmer;
-import org.xml.sax.InputSource;
-
-/** tests random analysis chains */
-public class TestRandomChains extends BaseTokenStreamTestCase {
-
- static List<Constructor<? extends Tokenizer>> tokenizers;
- static List<Constructor<? extends TokenFilter>> tokenfilters;
- static List<Constructor<? extends CharFilter>> charfilters;
-
- private static final Predicate<Object[]> ALWAYS = (objects -> true);
-
- private static final Set<Class<?>> avoidConditionals = new HashSet<>();
-
- static {
- // These filters needs to consume the whole tokenstream, so conditionals don't make sense here
- avoidConditionals.add(FingerprintFilter.class);
- avoidConditionals.add(MinHashFilter.class);
- avoidConditionals.add(ConcatenateGraphFilter.class);
- // ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can
- // expose inconsistent offsets
- // https://issues.apache.org/jira/browse/LUCENE-4170
- avoidConditionals.add(ShingleFilter.class);
- avoidConditionals.add(FixedShingleFilter.class);
- // FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition
- // can break position lengths
- avoidConditionals.add(FlattenGraphFilter.class);
- // LimitToken*Filters don't set end offsets correctly
- avoidConditionals.add(LimitTokenOffsetFilter.class);
- avoidConditionals.add(LimitTokenCountFilter.class);
- avoidConditionals.add(LimitTokenPositionFilter.class);
- }
-
- private static final Map<Constructor<?>, Predicate<Object[]>> brokenConstructors =
- new HashMap<>();
-
- static {
- initBrokenConstructors();
- }
-
- @SuppressWarnings("deprecation")
- private static void initBrokenConstructors() {
- try {
- brokenConstructors.put(
- LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class), ALWAYS);
- brokenConstructors.put(
- LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
- args -> {
- assert args.length == 3;
- return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
- });
- brokenConstructors.put(
- LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class), ALWAYS);
- brokenConstructors.put(
- LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
- args -> {
- assert args.length == 3;
- return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
- });
- brokenConstructors.put(
- LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class), ALWAYS);
- brokenConstructors.put(
- LimitTokenPositionFilter.class.getConstructor(
- TokenStream.class, int.class, boolean.class),
- args -> {
- assert args.length == 3;
- return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
- });
- for (Class<?> c :
- Arrays.<Class<?>>asList(
- // doesn't actual reset itself! TODO this statement is probably obsolete as of
- // LUCENE-6121 ?
- CachingTokenFilter.class,
- // LUCENE-8092: doesn't handle graph inputs
- CJKBigramFilter.class,
- // TODO: LUCENE-4983
- CommonGramsFilter.class,
- // TODO: doesn't handle graph inputs
- CommonGramsQueryFilter.class,
- // Not broken, simulates brokenness:
- CrankyTokenFilter.class,
- // TODO: doesn't handle graph inputs (or even look at positionIncrement)
- HyphenatedWordsFilter.class,
- // broken offsets
- PathHierarchyTokenizer.class,
- // broken offsets
- ReversePathHierarchyTokenizer.class,
- // Not broken: we forcefully add this, so we shouldn't
- // also randomly pick it:
- ValidatingTokenFilter.class,
- // TODO: it seems to mess up offsets!?
- WikipediaTokenizer.class,
- // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or
- // similar following will then cause pain)
- WordDelimiterFilter.class,
- // Cannot correct offsets when a char filter had changed them:
- WordDelimiterGraphFilter.class,
- // requires a special encoded token value, so it may fail with random data:
- DelimitedTermFrequencyTokenFilter.class,
- // requires a special encoded token value, so it may fail with random data:
- DelimitedBoostTokenFilter.class,
- // clones of core's filters:
- org.apache.lucene.analysis.core.StopFilter.class,
- org.apache.lucene.analysis.core.LowerCaseFilter.class)) {
- for (Constructor<?> ctor : c.getConstructors()) {
- brokenConstructors.put(ctor, ALWAYS);
- }
- }
- } catch (Exception e) {
- throw new Error(e);
- }
- }
-
- @BeforeClass
- public static void beforeClass() throws Exception {
- List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
- tokenizers = new ArrayList<>();
- tokenfilters = new ArrayList<>();
- charfilters = new ArrayList<>();
- for (final Class<?> c : analysisClasses) {
- final int modifiers = c.getModifiers();
- if (
- // don't waste time with abstract classes or deprecated known-buggy ones
- Modifier.isAbstract(modifiers)
- || !Modifier.isPublic(modifiers)
- || c.isSynthetic()
- || c.isAnonymousClass()
- || c.isMemberClass()
- || c.isInterface()
- || c.isAnnotationPresent(Deprecated.class)
- || !(Tokenizer.class.isAssignableFrom(c)
- || TokenFilter.class.isAssignableFrom(c)
- || CharFilter.class.isAssignableFrom(c))) {
- continue;
- }
-
- for (final Constructor<?> ctor : c.getConstructors()) {
- // don't test synthetic or deprecated ctors, they likely have known bugs:
- if (ctor.isSynthetic()
- || ctor.isAnnotationPresent(Deprecated.class)
- || brokenConstructors.get(ctor) == ALWAYS) {
- continue;
- }
- // conditional filters are tested elsewhere
- if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
- continue;
- }
- if (Tokenizer.class.isAssignableFrom(c)) {
- assertTrue(
- ctor.toGenericString() + " has unsupported parameter types",
- allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
- tokenizers.add(castConstructor(Tokenizer.class, ctor));
- } else if (TokenFilter.class.isAssignableFrom(c)) {
- assertTrue(
- ctor.toGenericString() + " has unsupported parameter types",
- allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
- tokenfilters.add(castConstructor(TokenFilter.class, ctor));
- } else if (CharFilter.class.isAssignableFrom(c)) {
- assertTrue(
- ctor.toGenericString() + " has unsupported parameter types",
- allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
- charfilters.add(castConstructor(CharFilter.class, ctor));
- } else {
- fail("Cannot get here");
- }
- }
- }
-
- final Comparator<Constructor<?>> ctorComp =
- (arg0, arg1) -> arg0.toGenericString().compareTo(arg1.toGenericString());
- Collections.sort(tokenizers, ctorComp);
- Collections.sort(tokenfilters, ctorComp);
- Collections.sort(charfilters, ctorComp);
- if (VERBOSE) {
- System.out.println("tokenizers = " + tokenizers);
- System.out.println("tokenfilters = " + tokenfilters);
- System.out.println("charfilters = " + charfilters);
- }
- }
-
- @AfterClass
- public static void afterClass() {
- tokenizers = null;
- tokenfilters = null;
- charfilters = null;
- }
-
- /**
- * Hack to work around the stupidness of Oracle's strict Java backwards compatibility. {@code
- * Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array!
- */
- @SuppressWarnings("unchecked")
- private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
- return (Constructor<T>) ctor;
- }
-
- public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception {
- final List<Class<?>> classes = new ArrayList<>();
- collectClassesForPackage(pckgname, classes);
- assertFalse(
- "No classes found in package '"
- + pckgname
- + "'; maybe your test classes are packaged as JAR file?",
- classes.isEmpty());
- return classes;
- }
-
- private static void collectClassesForPackage(String pckgname, List<Class<?>> classes)
- throws Exception {
- final ClassLoader cld = TestRandomChains.class.getClassLoader();
- final String path = pckgname.replace('.', '/');
- final Enumeration<URL> resources = cld.getResources(path);
- while (resources.hasMoreElements()) {
- final URI uri = resources.nextElement().toURI();
- if (!"file".equalsIgnoreCase(uri.getScheme())) continue;
- final Path directory = Paths.get(uri);
- if (Files.exists(directory)) {
- try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) {
- for (Path file : stream) {
- if (Files.isDirectory(file)) {
- // recurse
- String subPackage = pckgname + "." + file.getFileName().toString();
- collectClassesForPackage(subPackage, classes);
- }
- String fname = file.getFileName().toString();
- if (fname.endsWith(".class")) {
- String clazzName = fname.substring(0, fname.length() - 6);
- // exclude Test classes that happen to be in these packages.
- // class.ForName'ing some of them can cause trouble.
- if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) {
- // Don't run static initializers, as we won't use most of them.
- // Java will do that automatically once accessed/instantiated.
- classes.add(Class.forName(pckgname + '.' + clazzName, false, cld));
- }
- }
- }
- }
- }
- }
- }
-
- private static final Map<Class<?>, Function<Random, Object>> argProducers =
- new IdentityHashMap<Class<?>, Function<Random, Object>>() {
- {
- put(
- int.class,
- random -> {
- // TODO: could cause huge ram usage to use full int range for some filters
- // (e.g. allocate enormous arrays)
- // return Integer.valueOf(random.nextInt());
- return Integer.valueOf(TestUtil.nextInt(random, -50, 50));
- });
- put(
- char.class,
- random -> {
- // TODO: fix any filters that care to throw IAE instead.
- // also add a unicode validating filter to validate termAtt?
- // return Character.valueOf((char)random.nextInt(65536));
- while (true) {
- char c = (char) random.nextInt(65536);
- if (c < '\uD800' || c > '\uDFFF') {
- return Character.valueOf(c);
- }
- }
- });
- put(float.class, Random::nextFloat);
- put(boolean.class, Random::nextBoolean);
- put(byte.class, random -> (byte) random.nextInt(256));
- put(
- byte[].class,
- random -> {
- byte[] bytes = new byte[random.nextInt(256)];
- random.nextBytes(bytes);
- return bytes;
- });
- put(Random.class, random -> new Random(random.nextLong()));
- put(Version.class, random -> Version.LATEST);
- put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory);
- put(
- Set.class,
- random -> {
- // TypeTokenFilter
- Set<String> set = new HashSet<>();
- int num = random.nextInt(5);
- for (int i = 0; i < num; i++) {
- set.add(
- StandardTokenizer.TOKEN_TYPES[
- random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
- }
- return set;
- });
- put(
- Collection.class,
- random -> {
- // CapitalizationFilter
- Collection<char[]> col = new ArrayList<>();
- int num = random.nextInt(5);
- for (int i = 0; i < num; i++) {
- col.add(TestUtil.randomSimpleString(random).toCharArray());
- }
- return col;
- });
- put(
- CharArraySet.class,
- random -> {
- int num = random.nextInt(10);
- CharArraySet set = new CharArraySet(num, random.nextBoolean());
- for (int i = 0; i < num; i++) {
- // TODO: make nastier
- set.add(TestUtil.randomSimpleString(random));
- }
- return set;
- });
- // TODO: don't want to make the exponentially slow ones Dawid documents
- // in TestPatternReplaceFilter, so dont use truly random patterns (for now)
- put(Pattern.class, random -> Pattern.compile("a"));
- put(
- Pattern[].class,
- random -> new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")});
- put(
- PayloadEncoder.class,
- random ->
- new IdentityEncoder()); // the other encoders will throw exceptions if tokens
- // arent numbers?
- put(
- Dictionary.class,
- random -> {
- // TODO: make nastier
- InputStream affixStream =
- TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
- InputStream dictStream =
- TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
- try {
- return new Dictionary(
- new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
- } catch (Exception ex) {
- Rethrow.rethrow(ex);
- return null; // unreachable code
- }
- });
- put(
- HyphenationTree.class,
- random -> {
- // TODO: make nastier
- try {
- InputSource is =
- new InputSource(
- TestCompoundWordTokenFilter.class
- .getResource("da_UTF8.xml")
- .toExternalForm());
- HyphenationTree hyphenator =
- HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
- return hyphenator;
- } catch (Exception ex) {
- Rethrow.rethrow(ex);
- return null; // unreachable code
- }
- });
- put(
- SnowballStemmer.class,
- random -> {
- try {
- String lang =
- TestSnowball.SNOWBALL_LANGS.get(
- random.nextInt(TestSnowball.SNOWBALL_LANGS.size()));
- Class<? extends SnowballStemmer> clazz =
- Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer")
- .asSubclass(SnowballStemmer.class);
- return clazz.getConstructor().newInstance();
- } catch (Exception ex) {
- Rethrow.rethrow(ex);
- return null; // unreachable code
- }
- });
- put(
- String.class,
- random -> {
- // TODO: make nastier
- if (random.nextBoolean()) {
- // a token type
- return StandardTokenizer.TOKEN_TYPES[
- random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
- } else {
- return TestUtil.randomSimpleString(random);
- }
- });
- put(
- NormalizeCharMap.class,
- random -> {
- NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
- // we can't add duplicate keys, or NormalizeCharMap gets angry
- Set<String> keys = new HashSet<>();
- int num = random.nextInt(5);
- // System.out.println("NormalizeCharMap=");
- for (int i = 0; i < num; i++) {
- String key = TestUtil.randomSimpleString(random);
- if (!keys.contains(key) && key.length() > 0) {
- String value = TestUtil.randomSimpleString(random);
- builder.add(key, value);
- keys.add(key);
- // System.out.println("mapping: '" + key + "' => '" + value + "'");
- }
- }
- return builder.build();
- });
- put(
- CharacterRunAutomaton.class,
- random -> {
- // TODO: could probably use a purely random automaton
- switch (random.nextInt(5)) {
- case 0:
- return MockTokenizer.KEYWORD;
- case 1:
- return MockTokenizer.SIMPLE;
- case 2:
- return MockTokenizer.WHITESPACE;
- case 3:
- return MockTokenFilter.EMPTY_STOPSET;
- default:
- return MockTokenFilter.ENGLISH_STOPSET;
- }
- });
- put(
- CharArrayMap.class,
- random -> {
- int num = random.nextInt(10);
- CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean());
- for (int i = 0; i < num; i++) {
- // TODO: make nastier
- map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random));
- }
- return map;
- });
- put(
- StemmerOverrideMap.class,
- random -> {
- int num = random.nextInt(10);
- StemmerOverrideFilter.Builder builder =
- new StemmerOverrideFilter.Builder(random.nextBoolean());
- for (int i = 0; i < num; i++) {
- String input = "";
- do {
- input = TestUtil.randomRealisticUnicodeString(random);
- } while (input.isEmpty());
- String out = "";
- TestUtil.randomSimpleString(random);
- do {
- out = TestUtil.randomRealisticUnicodeString(random);
- } while (out.isEmpty());
- builder.add(input, out);
- }
- try {
- return builder.build();
- } catch (Exception ex) {
- Rethrow.rethrow(ex);
- return null; // unreachable code
- }
- });
- put(
- SynonymMap.class,
- new Function<Random, Object>() {
- @Override
- public Object apply(Random random) {
- SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
- final int numEntries = atLeast(10);
- for (int j = 0; j < numEntries; j++) {
- addSyn(
- b,
- randomNonEmptyString(random),
- randomNonEmptyString(random),
- random.nextBoolean());
- }
- try {
- return b.build();
- } catch (Exception ex) {
- Rethrow.rethrow(ex);
- return null; // unreachable code
- }
- }
-
- private void addSyn(
- SynonymMap.Builder b, String input, String output, boolean keepOrig) {
- b.add(
- new CharsRef(input.replaceAll(" +", "\u0000")),
- new CharsRef(output.replaceAll(" +", "\u0000")),
- keepOrig);
- }
-
- private String randomNonEmptyString(Random random) {
- while (true) {
- final String s = TestUtil.randomUnicodeString(random).trim();
- if (s.length() != 0 && s.indexOf('\u0000') == -1) {
- return s;
- }
- }
- }
- });
- put(
- DateFormat.class,
- random -> {
- if (random.nextBoolean()) return null;
- return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
- });
- put(
- Automaton.class,
- random -> {
- return Operations.determinize(
- new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(),
- Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
- });
- put(
- PatternTypingFilter.PatternTypingRule[].class,
- random -> {
- int numRules = TestUtil.nextInt(random, 1, 3);
- PatternTypingFilter.PatternTypingRule[] patternTypingRules =
- new PatternTypingFilter.PatternTypingRule[numRules];
- for (int i = 0; i < patternTypingRules.length; i++) {
- String s = TestUtil.randomSimpleString(random, 1, 2);
- // random regex with one group
- String regex = s + "(.*)";
- // pattern rule with a template that accepts one group.
- patternTypingRules[i] =
- new PatternTypingFilter.PatternTypingRule(
- Pattern.compile(regex), TestUtil.nextInt(random, 1, 8), s + "_$1");
- }
- return patternTypingRules;
- });
- }
- };
-
- static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
-
- static {
- allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
- allowedTokenizerArgs.addAll(argProducers.keySet());
- allowedTokenizerArgs.add(Reader.class);
- allowedTokenizerArgs.add(AttributeFactory.class);
- allowedTokenizerArgs.add(AttributeSource.class);
- allowedTokenizerArgs.add(Automaton.class);
-
- allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
- allowedTokenFilterArgs.addAll(argProducers.keySet());
- allowedTokenFilterArgs.add(TokenStream.class);
- // TODO: fix this one, thats broken:
- allowedTokenFilterArgs.add(CommonGramsFilter.class);
-
- allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());
- allowedCharFilterArgs.addAll(argProducers.keySet());
- allowedCharFilterArgs.add(Reader.class);
- }
-
- @SuppressWarnings("unchecked")
- static <T> T newRandomArg(Random random, Class<T> paramType) {
- final Function<Random, Object> producer = argProducers.get(paramType);
- assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
- return (T) producer.apply(random);
- }
-
- static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
- Object[] args = new Object[paramTypes.length];
- for (int i = 0; i < args.length; i++) {
- Class<?> paramType = paramTypes[i];
- if (paramType == AttributeSource.class) {
- // TODO: args[i] = new AttributeSource();
- // this is currently too scary to deal with!
- args[i] = null; // force IAE
- } else {
- args[i] = newRandomArg(random, paramType);
- }
- }
- return args;
- }
-
- static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
- Object[] args = new Object[paramTypes.length];
- for (int i = 0; i < args.length; i++) {
- Class<?> paramType = paramTypes[i];
- if (paramType == Reader.class) {
- args[i] = reader;
- } else {
- args[i] = newRandomArg(random, paramType);
- }
- }
- return args;
- }
-
- static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
- Object[] args = new Object[paramTypes.length];
- for (int i = 0; i < args.length; i++) {
- Class<?> paramType = paramTypes[i];
- if (paramType == TokenStream.class) {
- args[i] = stream;
- } else if (paramType == CommonGramsFilter.class) {
- // TODO: fix this one, thats broken: CommonGramsQueryFilter takes this one explicitly
- args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class));
- } else {
- args[i] = newRandomArg(random, paramType);
- }
- }
- return args;
- }
-
- static class MockRandomAnalyzer extends Analyzer {
- final long seed;
-
- MockRandomAnalyzer(long seed) {
- this.seed = seed;
- }
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Random random = new Random(seed);
- TokenizerSpec tokenizerSpec = newTokenizer(random);
- // System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
- TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
- // System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
- return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
- }
-
- @Override
- protected Reader initReader(String fieldName, Reader reader) {
- Random random = new Random(seed);
- CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
- return charfilterspec.reader;
- }
-
- @Override
- public String toString() {
- Random random = new Random(seed);
- StringBuilder sb = new StringBuilder();
- CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
- sb.append("\ncharfilters=");
- sb.append(charFilterSpec.toString);
- // intentional: initReader gets its own separate random
- random = new Random(seed);
- TokenizerSpec tokenizerSpec = newTokenizer(random);
- sb.append("\n");
- sb.append("tokenizer=");
- sb.append(tokenizerSpec.toString);
- TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
- sb.append("\n");
- sb.append("filters=");
- sb.append(tokenFilterSpec.toString);
- return sb.toString();
- }
-
- private <T> T createComponent(
- Constructor<T> ctor, Object[] args, StringBuilder descr, boolean isConditional) {
- try {
- final T instance = ctor.newInstance(args);
- /*
- if (descr.length() > 0) {
- descr.append(",");
- }
- */
- descr.append("\n ");
- if (isConditional) {
- descr.append("Conditional:");
- }
- descr.append(ctor.getDeclaringClass().getName());
- String params = Arrays.deepToString(args);
- params = params.substring(1, params.length() - 1);
- descr.append("(").append(params).append(")");
- return instance;
- } catch (InvocationTargetException ite) {
- final Throwable cause = ite.getCause();
- if (cause instanceof IllegalArgumentException
- || cause instanceof UnsupportedOperationException) {
- // thats ok, ignore
- if (VERBOSE) {
- System.err.println("Ignoring IAE/UOE from ctor:");
- cause.printStackTrace(System.err);
- }
- } else {
- Rethrow.rethrow(cause);
- }
- } catch (IllegalAccessException | InstantiationException iae) {
- Rethrow.rethrow(iae);
- }
- return null; // no success
- }
-
- private boolean broken(Constructor<?> ctor, Object[] args) {
- final Predicate<Object[]> pred = brokenConstructors.get(ctor);
- return pred != null && pred.test(args);
- }
-
- // create a new random tokenizer from classpath
- private TokenizerSpec newTokenizer(Random random) {
- TokenizerSpec spec = new TokenizerSpec();
- while (spec.tokenizer == null) {
- final Constructor<? extends Tokenizer> ctor =
- tokenizers.get(random.nextInt(tokenizers.size()));
- final StringBuilder descr = new StringBuilder();
- final Object[] args = newTokenizerArgs(random, ctor.getParameterTypes());
- if (broken(ctor, args)) {
- continue;
- }
- spec.tokenizer = createComponent(ctor, args, descr, false);
- if (spec.tokenizer != null) {
- spec.toString = descr.toString();
- }
- }
- return spec;
- }
-
- private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
- CharFilterSpec spec = new CharFilterSpec();
- spec.reader = reader;
- StringBuilder descr = new StringBuilder();
- int numFilters = random.nextInt(3);
- for (int i = 0; i < numFilters; i++) {
- while (true) {
- final Constructor<? extends CharFilter> ctor =
- charfilters.get(random.nextInt(charfilters.size()));
- final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
- if (broken(ctor, args)) {
- continue;
- }
- reader = createComponent(ctor, args, descr, false);
- if (reader != null) {
- spec.reader = reader;
- break;
- }
- }
- }
- spec.toString = descr.toString();
- return spec;
- }
-
- private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
- TokenFilterSpec spec = new TokenFilterSpec();
- spec.stream = tokenizer;
- StringBuilder descr = new StringBuilder();
- int numFilters = random.nextInt(5);
- for (int i = 0; i < numFilters; i++) {
-
- // Insert ValidatingTF after each stage so we can
- // catch problems right after the TF that "caused"
- // them:
- spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
-
- while (true) {
- final Constructor<? extends TokenFilter> ctor =
- tokenfilters.get(random.nextInt(tokenfilters.size()));
- if (random.nextBoolean()
- && avoidConditionals.contains(ctor.getDeclaringClass()) == false) {
- long seed = random.nextLong();
- spec.stream =
- new ConditionalTokenFilter(
- spec.stream,
- in -> {
- final Object[] args = newFilterArgs(random, in, ctor.getParameterTypes());
- if (broken(ctor, args)) {
- return in;
- }
- TokenStream ts = createComponent(ctor, args, descr, true);
- if (ts == null) {
- return in;
- }
- return ts;
- }) {
- Random random = new Random(seed);
-
- @Override
- public void reset() throws IOException {
- super.reset();
- random = new Random(seed);
- }
-
- @Override
- protected boolean shouldFilter() throws IOException {
- return random.nextBoolean();
- }
- };
- break;
- } else {
- final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
- if (broken(ctor, args)) {
- continue;
- }
- final TokenFilter flt = createComponent(ctor, args, descr, false);
- if (flt != null) {
- spec.stream = flt;
- break;
- }
- }
- }
- }
-
- // Insert ValidatingTF after each stage so we can
- // catch problems right after the TF that "caused"
- // them:
- spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
-
- spec.toString = descr.toString();
- return spec;
- }
- }
-
- static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
- boolean readSomething;
-
- CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
- super(in);
- }
-
- @Override
- public int correct(int currentOff) {
- return currentOff; // we don't change any offsets
- }
-
- @Override
- public int read(char[] cbuf, int off, int len) throws IOException {
- readSomething = true;
- return input.read(cbuf, off, len);
- }
-
- @Override
- public int read() throws IOException {
- readSomething = true;
- return input.read();
- }
-
- @Override
- public int read(CharBuffer target) throws IOException {
- readSomething = true;
- return input.read(target);
- }
-
- @Override
- public int read(char[] cbuf) throws IOException {
- readSomething = true;
- return input.read(cbuf);
- }
-
- @Override
- public long skip(long n) throws IOException {
- readSomething = true;
- return input.skip(n);
- }
-
- @Override
- public void mark(int readAheadLimit) throws IOException {
- input.mark(readAheadLimit);
- }
-
- @Override
- public boolean markSupported() {
- return input.markSupported();
- }
-
- @Override
- public boolean ready() throws IOException {
- return input.ready();
- }
-
- @Override
- public void reset() throws IOException {
- input.reset();
- }
- }
-
- static class TokenizerSpec {
- Tokenizer tokenizer;
- String toString;
- }
-
- static class TokenFilterSpec {
- TokenStream stream;
- String toString;
- }
-
- static class CharFilterSpec {
- Reader reader;
- String toString;
- }
-
- public void testRandomChains() throws Throwable {
- int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
- Random random = random();
- for (int i = 0; i < numIterations; i++) {
- try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
- if (VERBOSE) {
- System.out.println("Creating random analyzer:" + a);
- }
- try {
- checkNormalize(a);
- checkRandomData(
- random,
- a,
- 500 * RANDOM_MULTIPLIER,
- 20,
- false,
- false /* We already validate our own offsets... */);
- } catch (Throwable e) {
- System.err.println("Exception from random analyzer: " + a);
- throw e;
- }
- }
- }
- }
-
- public void checkNormalize(Analyzer a) {
- // normalization should not modify characters that may be used for wildcards
- // or regular expressions
- String s = "([0-9]+)?*";
- assertEquals(s, a.normalize("dummy", s).utf8ToString());
- }
-
- // we might regret this decision...
- public void testRandomChainsWithLargeStrings() throws Throwable {
- int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
- Random random = random();
- for (int i = 0; i < numIterations; i++) {
- try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
- if (VERBOSE) {
- System.out.println("Creating random analyzer:" + a);
- }
- try {
- checkRandomData(
- random,
- a,
- 50 * RANDOM_MULTIPLIER,
- 80,
- false,
- false /* We already validate our own offsets... */);
- } catch (Throwable e) {
- System.err.println("Exception from random analyzer: " + a);
- throw e;
- }
- }
- }
- }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java
index 5285428..166b4b7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
/** Simple tests to ensure the keyword marker filter factory is working. */
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java
index 9e366bc..c581ab2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
/** Simple tests to ensure the stemmer override filter factory is working. */
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java
index 3700658..5436001 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java
@@ -19,10 +19,10 @@ package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.tests.analysis.CannedTokenStream;
import org.apache.lucene.tests.analysis.Token;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
/** This test just ensures the factory works */
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java
index b55542a..1340714 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.snowball;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.EnglishStemmer;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java
index 7cd538c..c1024c3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestMultiWordSynonyms.java
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.synonym;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
/** @since solr 1.4 */
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
index 8df8e4b..b42c772 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
@@ -22,8 +22,8 @@ import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.pattern.PatternTokenizerFactory;
-import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
@Deprecated
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
index b82bde5..8f0ab4c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestFilesystemResourceLoader.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ResourceLoader;
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java
index 5fc3375..4a22cce 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* A {@link org.apache.lucene.analysis.TokenFilter} that adds Japanese romanized tokens to the term
@@ -54,6 +55,7 @@ import org.apache.lucene.util.CharsRefBuilder;
* WIDTH NORMALIZATION IS NOT PERFORMED, THIS DOES NOT WORK AS EXPECTED. See also: {@link
* JapaneseCompletionAnalyzer}.
*/
+@IgnoreRandomChains(reason = "LUCENE-10363: fails with incorrect offsets")
public final class JapaneseCompletionFilter extends TokenFilter {
public static final Mode DEFAULT_MODE = Mode.INDEX;
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
index c2350c7..2fdaffe 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.RollingCharBuffer;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
@@ -36,6 +37,8 @@ import org.apache.lucene.analysis.util.RollingCharBuffer;
* reached in order to not keep a copy of the character stream in memory. Vertical iteration marks,
* which are even rarer than horizontal iteration marks in contemporary Japanese, are unsupported.
*/
+@IgnoreRandomChains(
+ reason = "LUCENE-10358: fails with incorrect offsets or causes IndexOutOfBounds")
public class JapaneseIterationMarkCharFilter extends CharFilter {
/** Normalize kanji iteration marks by default */
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
index 18b5ee1..9198a17 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
@@ -45,6 +45,9 @@ public final class JapaneseKatakanaStemFilter extends TokenFilter {
public JapaneseKatakanaStemFilter(TokenStream input, int minimumLength) {
super(input);
+ if (minimumLength < 1) {
+ throw new IllegalArgumentException("minimumLength must be >=1");
+ }
this.minimumKatakanaLength = minimumLength;
}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
index 7043802..7b01751 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* A {@link TokenFilter} that normalizes Japanese numbers (kansūji) to regular Arabic decimal
@@ -82,6 +83,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
* <p>Japanese formal numbers (daiji), accounting numbers and decimal fractions are currently not
* supported.
*/
+@IgnoreRandomChains(reason = "LUCENE-10362: fails with incorrect offsets")
public class JapaneseNumberFilter extends TokenFilter {
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
index bbc5ccf..47cb8d1 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
@@ -41,6 +41,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
@@ -275,6 +276,7 @@ public final class JapaneseTokenizer extends Tokenizer {
* @param mode tokenization mode.
* @lucene.experimental
*/
+ @IgnoreRandomChains(reason = "Parameters are too complex to be tested")
public JapaneseTokenizer(
AttributeFactory factory,
TokenInfoDictionary systemDictionary,
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java
deleted file mode 100644
index d38acab..0000000
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ja;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import org.apache.lucene.util.ResourceLoader;
-
-/** Fake resource loader for tests: works if you want to fake reading a single file */
-class StringMockResourceLoader implements ResourceLoader {
- String text;
-
- public StringMockResourceLoader(String text) {
- this.text = text;
- }
-
- @Override
- public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
- try {
- return Class.forName(cname).asSubclass(expectedType);
- } catch (Exception e) {
- throw new RuntimeException("Cannot load class: " + cname, e);
- }
- }
-
- @Override
- public InputStream openResource(String resource) throws IOException {
- return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
- }
-}
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
index ee6232f..b05d5ed 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilte
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.util.LuceneTestCase.Nightly;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.util.Version;
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java
index 470ce0c..8026495 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java
@@ -22,6 +22,7 @@ import java.util.HashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link JapaneseBaseFormFilterFactory} */
public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java
index 6ac6dcc..9e456a5 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link JapaneseIterationMarkCharFilterFactory} */
public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java
index 53d8e9c..aee5f75 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java
@@ -22,6 +22,7 @@ import java.util.HashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link JapaneseKatakanaStemFilterFactory} */
public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java
index 5b260d8..ef91bab 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory} */
public class TestJapaneseNumberFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java
index c3d4038..02006ab 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.ClasspathResourceLoader;
import org.apache.lucene.util.Version;
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java
index e5ed23c..cd4eb04 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java
@@ -22,6 +22,7 @@ import java.util.HashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link JapaneseReadingFormFilterFactory} */
public class TestJapaneseReadingFormFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java
index 3c11270..fb11c4c 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link JapaneseTokenizerFactory} */
public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
index 648bb40..18a82ba 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* {@link TokenFilter} using Morfologik library to transform input tokens into lemma and
@@ -73,6 +74,7 @@ public class MorfologikFilter extends TokenFilter {
* @param in input token stream.
* @param dict Dictionary to use for stemming.
*/
+ @IgnoreRandomChains(reason = "No dictionary support yet")
public MorfologikFilter(final TokenStream in, final Dictionary dict) {
super(in);
this.input = in;
diff --git a/lucene/analysis/nori/src/java/module-info.java b/lucene/analysis/nori/src/java/module-info.java
index 9dd085b..77e6780 100644
--- a/lucene/analysis/nori/src/java/module-info.java
+++ b/lucene/analysis/nori/src/java/module-info.java
@@ -28,6 +28,7 @@ module org.apache.lucene.analysis.nori {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.ko.KoreanTokenizerFactory;
provides org.apache.lucene.analysis.TokenFilterFactory with
+ org.apache.lucene.analysis.ko.KoreanNumberFilterFactory,
org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilterFactory,
org.apache.lucene.analysis.ko.KoreanReadingFormFilterFactory;
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java
index 61ef959..bc435aa 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* A {@link TokenFilter} that normalizes Korean numbers to regular Arabic decimal numbers in
@@ -72,6 +73,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
*
* @lucene.experimental
*/
+@IgnoreRandomChains(reason = "LUCENE-10361: KoreanNumberFilter messes up offsets")
public class KoreanNumberFilter extends TokenFilter {
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index 0765b80..325fae7 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -40,6 +40,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
@@ -59,6 +60,7 @@ import org.apache.lucene.util.fst.FST;
*
* @lucene.experimental
*/
+@IgnoreRandomChains(reason = "LUCENE-10359: fails with incorrect offsets")
public final class KoreanTokenizer extends Tokenizer {
/** Token type reflecting the original source of this token */
@@ -205,6 +207,7 @@ public final class KoreanTokenizer extends Tokenizer {
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @lucene.experimental
*/
+ @IgnoreRandomChains(reason = "Parameters are too complex to be tested")
public KoreanTokenizer(
AttributeFactory factory,
TokenInfoDictionary systemDictionary,
diff --git a/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index 4fff753..cf903c1 100644
--- a/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/nori/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -13,5 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.lucene.analysis.ko.KoreanNumberFilterFactory
org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilterFactory
org.apache.lucene.analysis.ko.KoreanReadingFormFilterFactory
\ No newline at end of file
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java
deleted file mode 100644
index e29bfbe..0000000
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ko;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import org.apache.lucene.util.ResourceLoader;
-
-/** Fake resource loader for tests: works if you want to fake reading a single file */
-class StringMockResourceLoader implements ResourceLoader {
- private String text;
-
- public StringMockResourceLoader(String text) {
- this.text = text;
- }
-
- @Override
- public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
- try {
- return Class.forName(cname).asSubclass(expectedType);
- } catch (Exception e) {
- throw new RuntimeException("Cannot load class: " + cname, e);
- }
- }
-
- @Override
- public InputStream openResource(String resource) throws IOException {
- return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
- }
-}
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java
index 2a519e8..9dc244a 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link org.apache.lucene.analysis.ko.KoreanNumberFilterFactory} */
public class TestKoreanNumberFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
index 68fd7fa..5a6c31d 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
import org.apache.lucene.util.Version;
/** Simple tests for {@link KoreanPartOfSpeechStopFilterFactory} */
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java
index 46b910c..a92aab1 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java
@@ -21,6 +21,7 @@ import java.io.StringReader;
import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link KoreanReadingFormFilterFactory} */
public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java
index 93bd20d..63847cb 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java
@@ -23,6 +23,7 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
/** Simple tests for {@link KoreanTokenizerFactory} */
public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
index 93d0c11..0093227 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -27,12 +27,14 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Run OpenNLP chunker. Prerequisite: the OpenNLPTokenizer and OpenNLPPOSFilter must precede this
* filter. Tags terms in the TypeAttribute, replacing the POS tags previously put there by
* OpenNLPPOSFilter.
*/
+@IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
public final class OpenNLPChunkerFilter extends TokenFilter {
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
index 1e8e1d1..af14f03 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Runs OpenNLP dictionary-based and/or MaxEnt lemmatizers.
@@ -41,6 +42,7 @@ import org.apache.lucene.util.AttributeSource;
* <p>The dictionary file must be encoded as UTF-8, with one entry per line, in the form <code>
* word[tab]lemma[tab]part-of-speech</code>
*/
+@IgnoreRandomChains(reason = "LUCENE-10352: no dictionary support yet")
public class OpenNLPLemmatizerFilter extends TokenFilter {
private final NLPLemmatizerOp lemmatizerOp;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
index f9c7bdd..2cb3ab5 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -27,8 +27,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.IgnoreRandomChains;
/** Run OpenNLP POS tagger. Tags all terms in the TypeAttribute. */
+@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPPOSFilter extends TokenFilter {
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
index 134fa25..c31f5c1 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -26,12 +26,14 @@ import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
* the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
* apply operations to tokens one sentence at a time.
*/
+@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
public static int EOS_FLAG_BIT = 1;
diff --git a/lucene/analysis/phonetic/build.gradle b/lucene/analysis/phonetic/build.gradle
index e5595cb..2297af5 100644
--- a/lucene/analysis/phonetic/build.gradle
+++ b/lucene/analysis/phonetic/build.gradle
@@ -23,7 +23,7 @@ dependencies {
moduleApi project(':lucene:core')
moduleApi project(':lucene:analysis:common')
- moduleImplementation 'commons-codec:commons-codec'
+ moduleApi 'commons-codec:commons-codec'
testImplementation project(':lucene:test-framework')
}
diff --git a/lucene/analysis/phonetic/src/java/module-info.java b/lucene/analysis/phonetic/src/java/module-info.java
index 706251a..9bf5e64 100644
--- a/lucene/analysis/phonetic/src/java/module-info.java
+++ b/lucene/analysis/phonetic/src/java/module-info.java
@@ -26,6 +26,7 @@ module org.apache.lucene.analysis.phonetic {
provides org.apache.lucene.analysis.TokenFilterFactory with
org.apache.lucene.analysis.phonetic.BeiderMorseFilterFactory,
+ org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilterFactory,
org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilterFactory,
org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
}
diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
index 5e16e47..aa0dc1a 100644
--- a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
+++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* TokenFilter for Beider-Morse phonetic encoding.
@@ -33,6 +34,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* @see BeiderMorseEncoder
* @lucene.experimental
*/
+@IgnoreRandomChains(
+ reason = "LUCENE-10360: cannot handle empty tokens (or those only dashes and whitespace)")
public final class BeiderMorseFilter extends TokenFilter {
private final PhoneticEngine engine;
private final LanguageSet languages;
@@ -72,6 +75,7 @@ public final class BeiderMorseFilter extends TokenFilter {
* @param languages optional Set of original languages. Can be null (which means it will be
* guessed).
*/
+ @IgnoreRandomChains(reason = "LUCENE-10352: Add support for LanguageSet randomization")
public BeiderMorseFilter(TokenStream input, PhoneticEngine engine, LanguageSet languages) {
super(input);
this.engine = engine;
diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java
index e1f267a..6a950d8 100644
--- a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java
+++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java
@@ -39,6 +39,9 @@ public final class DoubleMetaphoneFilter extends TokenFilter {
*/
public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
+ if (maxCodeLength < 1) {
+ throw new IllegalArgumentException("maxCodeLength must be >=1");
+ }
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;
}
diff --git a/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index fe78873..677ae48 100644
--- a/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -14,5 +14,6 @@
# limitations under the License.
org.apache.lucene.analysis.phonetic.BeiderMorseFilterFactory
+org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilterFactory
org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilterFactory
org.apache.lucene.analysis.phonetic.PhoneticFilterFactory
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
index f87ee88..d3a6c24 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.IgnoreRandomChains;
/**
* This class can be used if the token attributes of a TokenStream are intended to be consumed more
@@ -31,6 +32,9 @@ import org.apache.lucene.util.AttributeSource;
* although only before {@link #incrementToken()} is called the first time. Prior to Lucene 5, it
* was never propagated.
*/
+@IgnoreRandomChains(
+ reason =
+ "doesn't actual reset itself! TODO: this statement is probably obsolete as of LUCENE-6121")
public final class CachingTokenFilter extends TokenFilter {
private List<AttributeSource.State> cache = null;
private Iterator<AttributeSource.State> iterator = null;
diff --git a/lucene/analysis/phonetic/build.gradle b/lucene/core/src/java/org/apache/lucene/util/IgnoreRandomChains.java
similarity index 60%
copy from lucene/analysis/phonetic/build.gradle
copy to lucene/core/src/java/org/apache/lucene/util/IgnoreRandomChains.java
index e5595cb..f6f4c2a 100644
--- a/lucene/analysis/phonetic/build.gradle
+++ b/lucene/core/src/java/org/apache/lucene/util/IgnoreRandomChains.java
@@ -14,17 +14,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.util;
-apply plugin: 'java-library'
-
-description = 'Analyzer for indexing phonetic signatures (for sounds-alike search)'
-
-dependencies {
- moduleApi project(':lucene:core')
- moduleApi project(':lucene:analysis:common')
-
- moduleImplementation 'commons-codec:commons-codec'
-
- testImplementation project(':lucene:test-framework')
-}
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+/**
+ * Annotation to not test a class or constructor with the {@code TestRandomChains} integration test.
+ *
+ * @lucene.internal
+ */
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.CONSTRUCTOR, ElementType.TYPE})
+public @interface IgnoreRandomChains {
+ /** A reason for ignoring should always be given. */
+ String reason();
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/StringMockResourceLoader.java
similarity index 97%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java
rename to lucene/test-framework/src/java/org/apache/lucene/tests/util/StringMockResourceLoader.java
index 87764d6..d708ac7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/StringMockResourceLoader.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.analysis.util;
+package org.apache.lucene.tests.util;
import java.io.ByteArrayInputStream;
import java.io.IOException;
diff --git a/settings.gradle b/settings.gradle
index ed641bf..0923e9d 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -36,6 +36,7 @@ include "lucene:analysis:opennlp"
include "lucene:analysis:phonetic"
include "lucene:analysis:smartcn"
include "lucene:analysis:stempel"
+include "lucene:analysis.tests"
include "lucene:backward-codecs"
include "lucene:benchmark"
include "lucene:classification"