You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/26 19:28:34 UTC

[3/4] tika git commit: TIKA-2041 -- add unit test in HTMLParserTest

TIKA-2041 -- add unit test in HTMLParserTest


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7dc5c671
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7dc5c671
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7dc5c671

Branch: refs/heads/master
Commit: 7dc5c671f892bac79d8fb2d55e9e94718ca31cbe
Parents: d698d49
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 15:28:09 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 15:28:09 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../apache/tika/parser/html/HtmlParserTest.java | 107 ++++++++++++++++++-
 2 files changed, 108 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/7dc5c671/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bef9d99..a8a17c5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.14 - ???
 
+  * Upgrade ICU4J charset detection components to fix multithreading
+    bug (TIKA-2041).
+
   * Upgrade to Jackcess 2.1.4 (TIKA-2039).
 
   * Maintain more significant digits in cells of "General" format

http://git-wip-us.apache.org/repos/asf/tika/blob/7dc5c671/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index d37394e..8599e5a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -19,7 +19,6 @@ package org.apache.tika.parser.html;
 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static java.nio.charset.StandardCharsets.US_ASCII;
 import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.TikaTest.assertContains;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
@@ -30,18 +29,35 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
 import java.io.Writer;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.regex.Pattern;
 
 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Geographic;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -59,7 +75,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.helpers.XMLReaderAdapter;
 
 public class HtmlParserTest extends TikaTest {
 
@@ -1169,4 +1184,92 @@ public class HtmlParserTest extends TikaTest {
         XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
         assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", r.xml);
     }
+
+    /**
+     * Test added for TIKA-2041: runs every {@link EncodingDetector} found by the
+     * service loader through the concurrent check in
+     * {@link #testDetector(EncodingDetector)}.
+     *
+     * @throws Exception if loading the detectors or checking any of them fails
+     */
+    @Test
+    public void testMultiThreadingEncodingDetection() throws Exception {
+        List<EncodingDetector> detectors = new ArrayList<>();
+        ServiceLoader loader =
+                new ServiceLoader(AutoDetectReader.class.getClassLoader());
+        detectors.addAll(loader.loadServiceProviders(EncodingDetector.class));
+        // Without this guard an empty provider list would let the test pass
+        // without exercising a single detector.
+        assertFalse("no encoding detectors loaded", detectors.isEmpty());
+        for (EncodingDetector detector : detectors) {
+            testDetector(detector);
+        }
+    }
+
+    /**
+     * Checks that one shared detector instance reports the same encoding for each
+     * test document when called from several threads at once as it reported when
+     * called from a single thread.
+     * <p>
+     * The encodings of all <code>.txt</code> and <code>.html</code> files directly
+     * under <code>/test-documents</code> are first recorded sequentially; the same
+     * files are then queued and re-detected by a pool of
+     * {@link EncodingDetectorRunner}s that compare against the recorded values.
+     *
+     * @param detector the detector instance shared by all worker threads
+     * @throws Exception if the test documents cannot be read, or (as an
+     *                   ExecutionException) if any worker fails
+     */
+    private void testDetector(EncodingDetector detector) throws Exception {
+        Path testDocs = Paths.get(this.getClass().getResource("/test-documents").toURI());
+        File[] testDocArray = testDocs.toFile().listFiles();
+        assertNotNull("no test docs??", testDocArray);
+
+        // Single-threaded pass: record the baseline encoding of every candidate file.
+        List<Path> tmp = new ArrayList<>();
+        Map<Path, String> encodings = new ConcurrentHashMap<>();
+        for (File file : testDocArray) {
+            String name = file.getName();
+            if (name.endsWith(".txt") || name.endsWith(".html")) {
+                Path path = file.toPath();
+                String encoding = getEncoding(detector, path);
+                tmp.add(path);
+                encodings.put(path, encoding);
+            }
+        }
+        // ArrayBlockingQueue rejects a capacity of zero; fail with a clear message instead.
+        assertFalse("no .txt or .html test docs", tmp.isEmpty());
+
+        ArrayBlockingQueue<Path> paths = new ArrayBlockingQueue<>(tmp.size());
+        paths.addAll(tmp);
+        int numThreads = paths.size() + 1;
+        ExecutorService ex = Executors.newFixedThreadPool(numThreads);
+        try {
+            CompletionService<String> completionService =
+                    new ExecutorCompletionService<>(ex);
+            for (int i = 0; i < numThreads; i++) {
+                completionService.submit(new EncodingDetectorRunner(paths, encodings, detector));
+            }
+            for (int completed = 0; completed < numThreads; completed++) {
+                // take() only hands back finished futures; get() rethrows any
+                // failure raised inside call() wrapped in an ExecutionException.
+                assertEquals(EncodingDetectorRunner.DONE, completionService.take().get());
+            }
+        } finally {
+            // Always release the pool threads, including when a worker failed.
+            ex.shutdownNow();
+        }
+    }
+
+    /**
+     * Worker that drains paths from a shared queue, re-detects each file's
+     * encoding with the shared detector and asserts that it equals the value
+     * stored for that path in the encodings map.
+     */
+    private class EncodingDetectorRunner implements Callable<String> {
+
+        /** Value returned by {@link #call()} once the worker has finished. */
+        static final String DONE = "done";
+
+        private final ArrayBlockingQueue<Path> paths;
+        private final Map<Path, String> encodings;
+        private final EncodingDetector detector;
+
+        private EncodingDetectorRunner(ArrayBlockingQueue<Path> paths,
+                                       Map<Path, String> encodings, EncodingDetector detector) {
+            this.paths = paths;
+            this.encodings = encodings;
+            this.detector = detector;
+        }
+
+        /**
+         * Processes at most as many queued paths as there are recorded
+         * encodings, stopping early once the queue is empty.
+         *
+         * @return {@link #DONE}
+         * @throws IOException if a file cannot be read during detection
+         */
+        @Override
+        public String call() throws IOException {
+            int processed = 0;
+            while (processed < encodings.size()) {
+                Path next = paths.poll();
+                if (next == null) {
+                    // queue drained, nothing left for this worker
+                    break;
+                }
+                String actual = getEncoding(detector, next);
+                String expected = encodings.get(next);
+                String message =
+                        "detector class="+detector.getClass() + " : file=" + next.toString();
+                assertEquals(message, expected, actual);
+                processed++;
+            }
+            return DONE;
+        }
+    }
+
+    /**
+     * Runs the detector over the file at the given path with a fresh
+     * {@link Metadata} object.
+     *
+     * @param detector detector to run
+     * @param p        file whose encoding should be detected
+     * @return the detected charset's string form, or the literal "NULL" when the
+     *         detector returns no charset
+     * @throws IOException if the file cannot be opened or read
+     */
+    public String getEncoding(EncodingDetector detector, Path p) throws IOException {
+        try (InputStream stream = TikaInputStream.get(p)) {
+            Charset detected = detector.detect(stream, new Metadata());
+            return detected == null ? "NULL" : detected.toString();
+        }
+    }
 }