You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/08 09:50:45 UTC
[lucene-solr] branch master updated: LUCENE-9727: build side
support for running Hunspell tests. (#2313)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 903782d LUCENE-9727: build side support for running Hunspell tests. (#2313)
903782d is described below
commit 903782d75656c63de018af6c102b14dd2cf2748d
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Mon Feb 8 10:50:25 2021 +0100
LUCENE-9727: build side support for running Hunspell tests. (#2313)
---
gradle/testing/randomization/policies/tests.policy | 6 +-
lucene/analysis/common/build.gradle | 15 ++
.../analysis/hunspell/TestAllDictionaries.java | 173 ++++++++++++++++-----
.../hunspell/TestHunspellRepositoryTestCases.java | 2 +-
.../lucene/analysis/hunspell/TestPerformance.java | 24 ++-
.../org/apache/lucene/util/RamUsageEstimator.java | 17 +-
.../org/apache/lucene/util/RamUsageTester.java | 22 ++-
7 files changed, 198 insertions(+), 61 deletions(-)
diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy
index e17af8e..469892c 100644
--- a/gradle/testing/randomization/policies/tests.policy
+++ b/gradle/testing/randomization/policies/tests.policy
@@ -91,10 +91,12 @@ grant {
// allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
permission java.security.SecurityPermission "createAccessControlContext";
- // Some Hunspell tests may read from external files specified in system properties
+ // Hunspell regression and validation tests can read from external files
+ // specified in system properties.
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
- permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
+ permission java.io.FilePermission "${hunspell.dictionaries}", "read";
+ permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
};
// Permissions to support ant build
diff --git a/lucene/analysis/common/build.gradle b/lucene/analysis/common/build.gradle
index a44152c..24c949f 100644
--- a/lucene/analysis/common/build.gradle
+++ b/lucene/analysis/common/build.gradle
@@ -23,3 +23,18 @@ dependencies {
api project(':lucene:core')
testImplementation project(':lucene:test-framework')
}
+
+// Pass all hunspell-tests-specific project properties to tests as system properties.
+tasks.withType(Test) {
+ [
+ "hunspell.dictionaries",
+ "hunspell.corpora",
+ "hunspell.repo.path"
+ ].each {
+ def val = propertyOrDefault(it, null)
+ if (val != null) {
+ logger.lifecycle("Passing property: ${it}=${val}")
+ systemProperty it, val
+ }
+ }
+}
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index 886272c..c267663 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -16,35 +16,52 @@
*/
package org.apache.lucene.analysis.hunspell;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
+import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.RamUsageTester;
import org.junit.Assume;
import org.junit.Ignore;
/**
- * Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and
- * prints their memory usage. All *.aff files are traversed directly inside the given directory or
- * in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For
- * examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package
- * documentation}
+ * Loads all dictionaries from the directory specified in {@code hunspell.dictionaries} system
+ * property and prints their memory usage. All *.aff files are traversed recursively inside the
+ * given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such
+ * directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}.
*/
-@Ignore("enable manually")
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase {
-
static Stream<Path> findAllAffixFiles() throws IOException {
String dicDir = System.getProperty("hunspell.dictionaries");
- Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
- return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
+ Assume.assumeFalse(
+ "Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null);
+ Path dicPath = Paths.get(dicDir);
+ return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted();
}
static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
@@ -58,43 +75,121 @@ public class TestAllDictionaries extends LuceneTestCase {
}
}
- public void testDictionariesLoadSuccessfully() throws Exception {
- int failures = 0;
+ /** Hack bais to expose current position. */
+ private static class ExposePosition extends ByteArrayInputStream {
+ public ExposePosition(byte[] buf) {
+ super(buf);
+ }
+
+ public long position() {
+ return super.pos;
+ }
+ }
+
+ @Ignore
+ public void testMaxPrologueNeeded() throws Exception {
+ AtomicBoolean failTest = new AtomicBoolean();
+
+ Map<String, List<Long>> global = new LinkedHashMap<>();
for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
- try {
- System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
- } catch (Throwable e) {
- failures++;
- System.err.println("While checking " + aff + ":");
- e.printStackTrace();
+ Map<String, List<Long>> local = new LinkedHashMap<>();
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) {
+ int chr;
+ while ((chr = is.read()) >= 0) {
+ baos.write(chr);
+
+ if (chr == '\n') {
+ String line = baos.toString(StandardCharsets.ISO_8859_1);
+ if (!line.isBlank()) {
+ String firstWord = line.split("\\s")[0];
+ switch (firstWord) {
+ case "SET":
+ case "FLAG":
+ local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
+ global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
+ break;
+ }
+ }
+
+ baos.reset();
+ }
+ }
+ }
+
+ local.forEach(
+ (flag, positions) -> {
+ if (positions.size() > 1) {
+ System.out.format(
+ Locale.ROOT,
+ "Flag %s at more than one position in %s: %s%n",
+ flag,
+ aff,
+ positions);
+ failTest.set(true);
+ }
+ });
+ }
+
+ global.forEach(
+ (flag, positions) -> {
+ long max = positions.stream().mapToLong(v -> v).max().orElse(0);
+ System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
+ });
+
+ if (failTest.get()) {
+ throw new AssertionError("Duplicate flags were present in at least one .aff file.");
+ }
+ }
+
+ public void testDictionariesLoadSuccessfully() throws Exception {
+ int threads = Runtime.getRuntime().availableProcessors();
+ ExecutorService executor =
+ Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-"));
+ try {
+ List<Path> failures = Collections.synchronizedList(new ArrayList<>());
+ Function<Path, Void> process =
+ (Path aff) -> {
+ try {
+ System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
+ } catch (Throwable e) {
+ failures.add(aff);
+ System.err.println("While checking " + aff + ":");
+ e.printStackTrace();
+ }
+ return null;
+ };
+
+ for (Future<?> future :
+ executor.invokeAll(
+ findAllAffixFiles()
+ .map(aff -> (Callable<?>) () -> process.apply(aff))
+ .collect(Collectors.toList()))) {
+ future.get();
+ }
+
+ if (!failures.isEmpty()) {
+ throw new AssertionError(
+ "Certain dictionaries failed to parse:\n - "
+ + failures.stream()
+ .map(path -> path.toAbsolutePath().toString())
+ .collect(Collectors.joining("\n - ")));
}
+ } finally {
+ executor.shutdown();
+ executor.awaitTermination(1, TimeUnit.MINUTES);
}
- assertEquals(failures + " failures!", 0, failures);
}
private static String memoryUsage(Dictionary dic) {
return RamUsageTester.humanSizeOf(dic)
+ "\t("
- + "words="
- + RamUsageTester.humanSizeOf(dic.words)
- + ", "
- + "flags="
- + RamUsageTester.humanSizeOf(dic.flagLookup)
- + ", "
- + "strips="
- + RamUsageTester.humanSizeOf(dic.stripData)
- + ", "
- + "conditions="
- + RamUsageTester.humanSizeOf(dic.patterns)
- + ", "
- + "affixData="
- + RamUsageTester.humanSizeOf(dic.affixData)
- + ", "
- + "prefixes="
- + RamUsageTester.humanSizeOf(dic.prefixes)
- + ", "
- + "suffixes="
- + RamUsageTester.humanSizeOf(dic.suffixes)
- + ")";
+ + ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ")
+ + ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ")
+ + ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
+ + ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
+ + ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
+ + ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
+ + ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
index 048dc04..004ef73 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
@@ -32,7 +32,7 @@ import org.junit.runners.Parameterized;
/**
* Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
- * out Hunspell repository should be in {@code -Dhunspell.repo.path=...} system property.
+ * out Hunspell repository should be in {@code hunspell.repo.path} system property.
*/
@RunWith(Parameterized.class)
public class TestHunspellRepositoryTestCases {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 33da1ca..e26cae7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -24,13 +24,15 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
-import org.junit.Ignore;
+import org.junit.AssumptionViolatedException;
+import org.junit.BeforeClass;
import org.junit.Test;
/**
@@ -40,8 +42,15 @@ import org.junit.Test;
* en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
*/
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
-@Ignore("enable manually")
public class TestPerformance extends LuceneTestCase {
+ private static Path corporaDir;
+
+ @BeforeClass
+ public static void resolveCorpora() {
+ String dir = System.getProperty("hunspell.corpora");
+ Assume.assumeFalse("Requires test word corpora at -Dhunspell.corpora=...", dir == null);
+ corporaDir = Paths.get(dir);
+ }
@Test
public void en() throws Exception {
@@ -60,6 +69,7 @@ public class TestPerformance extends LuceneTestCase {
private void checkPerformance(String code, int wordCount) throws Exception {
Path aff = findAffFile(code);
+
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
System.out.println("Loaded " + aff);
@@ -92,15 +102,17 @@ public class TestPerformance extends LuceneTestCase {
return code.equals(Dictionary.extractLanguageCode(parentName));
})
.findFirst()
- .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
+ .orElseThrow(
+ () -> new AssumptionViolatedException("Ignored, cannot find aff/dic for: " + code));
}
private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
throws IOException {
- String corpusDir = System.getProperty("hunspell.corpora");
- Assume.assumeFalse("", corpusDir == null);
+ Path dataPath = corporaDir.resolve(code + ".txt");
+ if (!Files.isReadable(dataPath)) {
+ throw new AssumptionViolatedException("Missing text corpora at: " + dataPath);
+ }
- Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
List<String> words = new ArrayList<>();
try (InputStream stream = Files.newInputStream(dataPath)) {
BufferedReader reader =
diff --git a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
index 7140546..6c9a75b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
@@ -20,6 +20,7 @@ import java.lang.reflect.Array;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
+import java.security.AccessControlException;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.text.DecimalFormat;
@@ -527,14 +528,14 @@ public final class RamUsageEstimator {
// Walk type hierarchy
for (; clazz != null; clazz = clazz.getSuperclass()) {
final Class<?> target = clazz;
- final Field[] fields =
- AccessController.doPrivileged(
- new PrivilegedAction<Field[]>() {
- @Override
- public Field[] run() {
- return target.getDeclaredFields();
- }
- });
+ final Field[] fields;
+ try {
+ fields =
+ AccessController.doPrivileged((PrivilegedAction<Field[]>) target::getDeclaredFields);
+ } catch (AccessControlException e) {
+ throw new RuntimeException("Can't access fields of class: " + target, e);
+ }
+
for (Field f : fields) {
if (!Modifier.isStatic(f.getModifiers())) {
size = adjustForField(size, f);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
index 39d2556..2c5c6af 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
@@ -23,11 +23,14 @@ import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
import java.nio.file.Path;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.AbstractList;
import java.util.ArrayList;
+import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -151,6 +154,14 @@ public final class RamUsageTester {
ArrayList<Object> stack,
Object ob,
Class<?> obClazz) {
+
+ // Ignore JDK objects we can't access or handle properly.
+ Predicate<Object> isIgnorable =
+ (clazz) -> (clazz instanceof CharsetEncoder) || (clazz instanceof CharsetDecoder);
+ if (isIgnorable.test(ob)) {
+ return accumulator.accumulateObject(ob, 0, Collections.emptyMap(), stack);
+ }
+
/*
* Consider an object. Push any references it has to the processing stack
* and accumulate this object's shallow size.
@@ -159,10 +170,7 @@ public final class RamUsageTester {
if (Constants.JRE_IS_MINIMUM_JAVA9) {
long alignedShallowInstanceSize = RamUsageEstimator.shallowSizeOf(ob);
- Predicate<Class<?>> isJavaModule =
- (clazz) -> {
- return clazz.getName().startsWith("java.");
- };
+ Predicate<Class<?>> isJavaModule = (clazz) -> clazz.getName().startsWith("java.");
// Java 9: Best guess for some known types, as we cannot precisely look into runtime
// classes:
@@ -274,13 +282,17 @@ public final class RamUsageTester {
v.length())); // may not be correct with Java 9's compact strings!
a(StringBuilder.class, v -> charArraySize(v.capacity()));
a(StringBuffer.class, v -> charArraySize(v.capacity()));
+ // Approximate the underlying long[] buffer.
+ a(BitSet.class, v -> (v.size() / Byte.SIZE));
// Types with large buffers:
a(ByteArrayOutputStream.class, v -> byteArraySize(v.size()));
// For File and Path, we just take the length of String representation as
// approximation:
a(File.class, v -> charArraySize(v.toString().length()));
a(Path.class, v -> charArraySize(v.toString().length()));
- a(ByteOrder.class, v -> 0); // Instances of ByteOrder are constants
+
+ // Ignorable JDK classes.
+ a(ByteOrder.class, v -> 0);
}
@SuppressWarnings("unchecked")