You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/26 19:28:34 UTC
[3/4] tika git commit: TIKA-2041 -- add unit test in HTMLParserTest
TIKA-2041 -- add unit test in HTMLParserTest
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7dc5c671
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7dc5c671
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7dc5c671
Branch: refs/heads/master
Commit: 7dc5c671f892bac79d8fb2d55e9e94718ca31cbe
Parents: d698d49
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 15:28:09 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 15:28:09 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../apache/tika/parser/html/HtmlParserTest.java | 107 ++++++++++++++++++-
2 files changed, 108 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7dc5c671/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bef9d99..a8a17c5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.14 - ???
+ * Upgrade ICU4J charset detection components to fix multithreading
+ bug (TIKA-2041).
+
* Upgrade to Jackcess 2.1.4 (TIKA-2039).
* Maintain more significant digits in cells of "General" format
http://git-wip-us.apache.org/repos/asf/tika/blob/7dc5c671/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index d37394e..8599e5a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -19,7 +19,6 @@ package org.apache.tika.parser.html;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
@@ -30,18 +29,35 @@ import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayInputStream;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -59,7 +75,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.helpers.XMLReaderAdapter;
public class HtmlParserTest extends TikaTest {
@@ -1169,4 +1184,92 @@ public class HtmlParserTest extends TikaTest {
XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", r.xml);
}
+
+    /**
+     * Runs the concurrency check in {@code testDetector} once for every
+     * {@link EncodingDetector} that the {@link ServiceLoader} discovers through
+     * the class loader of {@link AutoDetectReader}.
+     */
+    @Test
+    public void testMultiThreadingEncodingDetection() throws Exception {
+        ServiceLoader serviceLoader =
+                new ServiceLoader(AutoDetectReader.class.getClassLoader());
+        // copy the discovered providers into a list of our own before iterating
+        List<EncodingDetector> encodingDetectors =
+                new ArrayList<>(serviceLoader.loadServiceProviders(EncodingDetector.class));
+        for (EncodingDetector encodingDetector : encodingDetectors) {
+            testDetector(encodingDetector);
+        }
+    }
+
+    /**
+     * Checks that a detector reports the same charset under concurrent use as it
+     * does when called from a single thread.
+     * <p>
+     * The charset of every {@code .txt} and {@code .html} file directly under
+     * {@code /test-documents} is first computed sequentially and recorded as the
+     * expected value. The same files are then placed on a shared queue and drained
+     * by a pool of {@link EncodingDetectorRunner}s that all use the one
+     * {@code detector} instance and compare each result with the expected value.
+     *
+     * @param detector the detector to exercise; shared by all worker threads
+     * @throws Exception if the test documents cannot be located or read, or
+     *         (wrapped in an {@link java.util.concurrent.ExecutionException})
+     *         if any worker threw while detecting or comparing
+     */
+    private void testDetector(EncodingDetector detector) throws Exception {
+        Path testDocs = Paths.get(this.getClass().getResource("/test-documents").toURI());
+        File[] testDocArray = testDocs.toFile().listFiles();
+        assertNotNull("no test docs??", testDocArray);
+
+        // Sequential pass: record what the detector says when nothing else is running.
+        List<Path> tmp = new ArrayList<>();
+        Map<Path, String> encodings = new ConcurrentHashMap<>();
+        for (File file : testDocArray) {
+            String fileName = file.getName();
+            if (fileName.endsWith(".txt") || fileName.endsWith(".html")) {
+                Path path = file.toPath();
+                String encoding = getEncoding(detector, path);
+                tmp.add(path);
+                encodings.put(path, encoding);
+            }
+        }
+        // ArrayBlockingQueue rejects a capacity of 0 with an IllegalArgumentException;
+        // fail with a readable message instead.
+        assertFalse("no .txt or .html test docs??", tmp.isEmpty());
+
+        ArrayBlockingQueue<Path> paths = new ArrayBlockingQueue<>(tmp.size());
+        paths.addAll(tmp);
+        // one more worker than there are files
+        int numThreads = paths.size() + 1;
+        ExecutorService ex = Executors.newFixedThreadPool(numThreads);
+        try {
+            CompletionService<String> completionService =
+                    new ExecutorCompletionService<>(ex);
+            for (int i = 0; i < numThreads; i++) {
+                completionService.submit(new EncodingDetectorRunner(paths, encodings, detector));
+            }
+            // take() only hands back finished tasks, so exactly numThreads iterations
+            // are needed. get() rethrows anything thrown inside call() (IOException or
+            // a failed assertion) as an ExecutionException, which fails this test.
+            for (int completed = 0; completed < numThreads; completed++) {
+                Future<String> future = completionService.take();
+                assertEquals(EncodingDetectorRunner.DONE, future.get());
+            }
+        } finally {
+            // Always release the pool's threads, including when a worker failed;
+            // otherwise each tested detector leaks numThreads idle threads.
+            ex.shutdownNow();
+        }
+    }
+
+ private class EncodingDetectorRunner implements Callable<String> {
+
+ final static String DONE = "done";
+ private final ArrayBlockingQueue<Path> paths;
+ private final Map<Path, String> encodings;
+ private final EncodingDetector detector;
+ private EncodingDetectorRunner(ArrayBlockingQueue<Path> paths,
+ Map<Path, String> encodings, EncodingDetector detector) {
+ this.paths = paths;
+ this.encodings = encodings;
+ this.detector = detector;
+ }
+
+ @Override
+ public String call() throws IOException {
+ for (int i = 0; i < encodings.size(); i++) {
+ Path p = paths.poll();
+ if (p == null) {
+ return DONE;
+ }
+ String detectedEncoding = getEncoding(detector, p);
+ String trueEncoding = encodings.get(p);
+ assertEquals( "detector class="+detector.getClass() + " : file=" + p.toString(),
+ trueEncoding, detectedEncoding);
+
+ }
+ return DONE;
+ }
+ }
+
+    /**
+     * Runs the detector on a file, using a fresh {@link Metadata} object.
+     *
+     * @param detector the detector to call
+     * @param p        the file to open
+     * @return the string form of the detected charset, or the literal
+     *         {@code "NULL"} when the detector returns {@code null}
+     * @throws IOException if the file cannot be opened or read
+     */
+    public String getEncoding(EncodingDetector detector, Path p) throws IOException {
+        try (InputStream stream = TikaInputStream.get(p)) {
+            Charset detected = detector.detect(stream, new Metadata());
+            return detected == null ? "NULL" : detected.toString();
+        }
+    }
}