You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "Tim Allison (JIRA)" <ji...@apache.org> on 2016/07/26 02:52:21 UTC

[jira] [Created] (TIKA-2041) Charset detection doesn't appear to be thread-safe

Tim Allison created TIKA-2041:
---------------------------------

             Summary: Charset detection doesn't appear to be thread-safe
                 Key: TIKA-2041
                 URL: https://issues.apache.org/jira/browse/TIKA-2041
             Project: Tika
          Issue Type: Bug
            Reporter: Tim Allison


On the user list, Christian Leitinger noted that his team found a potential issue with the thread safety of the encoding detector.  I was able to reproduce this on the corpus of HTML files from [~faghani]'s encoding detector.

{noformat}
    /**
     * Checks that charset detection gives the same answer under concurrency as it
     * does single-threaded.
     *
     * <p>First records, on the calling thread, the encoding reported by
     * {@link #getEncoding(Path)} for every file found one directory level below the
     * corpus root. Then submits {@code numThreads} {@code EncodingDetectorRunner}
     * tasks that re-detect files concurrently and throw if a result differs from
     * the recorded baseline.
     *
     * @throws Exception if the corpus cannot be read, or (as an
     *         {@code ExecutionException}) if any runner observed a mismatch
     */
    @Test
    public void testMultiThreadingEncodingDetection() throws Exception {

        Path testDocs = Paths.get("C:/data/encodings/corpus");
        List<Path> paths = new ArrayList<>();
        Map<Path, String> encodings = new ConcurrentHashMap<>();

        // listFiles() returns null when the path is missing or is not a directory;
        // fail with a clear message instead of a bare NullPointerException.
        File[] encodingDirs = testDocs.toFile().listFiles();
        if (encodingDirs == null) {
            throw new IllegalStateException(
                    "corpus directory is missing or unreadable: " + testDocs);
        }
        for (File encodingDir : encodingDirs) {
            File[] files = encodingDir.listFiles();
            if (files == null) {
                // a stray regular file next to the per-encoding directories
                continue;
            }
            for (File file : files) {
                Path path = file.toPath();
                // single-threaded baseline the runners are compared against
                String encoding = getEncoding(path);
                paths.add(path);
                encodings.put(path, encoding);
            }
        }

        int numThreads = 1000;
        ExecutorService ex = Executors.newFixedThreadPool(numThreads);
        try {
            CompletionService<String> completionService =
                    new ExecutorCompletionService<>(ex);

            for (int i = 0; i < numThreads; i++) {
                completionService.submit(new EncodingDetectorRunner(paths, encodings), "done");
            }
            for (int completed = 0; completed < numThreads; completed++) {
                // take() only hands back finished futures; get() rethrows a runner's
                // failure as an ExecutionException, which fails the test.
                completionService.take().get();
            }
        } finally {
            // Always release the worker threads, including when a runner failed.
            ex.shutdownNow();
        }
    }

    /**
     * Task that repeatedly picks a random file from {@code paths}, re-detects its
     * encoding through the enclosing instance's {@code getEncoding}, and throws a
     * {@link RuntimeException} as soon as a result differs from the value stored
     * in {@code encodings}.
     */
    private class EncodingDetectorRunner implements Runnable {
        private final List<Path> paths;
        private final Map<Path, String> encodings;
        private final Random random = new Random();

        private EncodingDetectorRunner(List<Path> paths, Map<Path, String> encodings) {
            this.paths = paths;
            this.encodings = encodings;
        }

        /** Performs 100 random detections; returns normally only if all of them match. */
        @Override
        public void run() {
            int remaining = 100;
            while (remaining-- > 0) {
                int index = random.nextInt(paths.size());
                String actual = detectAt(index);
                String expected = encodings.get(paths.get(index));
                if (actual.equals(expected)) {
                    continue;
                }
                throw new RuntimeException("detected: " + actual +
                        " but should have been: " + expected + " for " + paths.get(index));
            }
        }

        /** Detects the encoding of the file at {@code index}, wrapping any failure unchecked. */
        private String detectAt(int index) {
            try {
                return getEncoding(paths.get(index));
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Opens the file at {@code p}, lets {@code AutoDetectReader} determine its
     * character encoding, and returns that charset rendered as a string.
     *
     * @param p file whose encoding should be detected
     * @return the detected charset's string form, or the literal {@code "NULL"}
     *         if the reader reports no charset
     * @throws Exception if the file cannot be opened or detection fails
     */
    public String getEncoding(Path p) throws Exception {
        // The reader is a resource too; closing it here instead of leaving it to
        // the garbage collector.
        try (InputStream is = TikaInputStream.get(p);
             AutoDetectReader reader = new AutoDetectReader(is)) {
            // Test the Charset itself: calling toString() first would throw a
            // NullPointerException before the "NULL" fallback could be reached.
            java.nio.charset.Charset charset = reader.getCharset();
            if (charset == null) {
                return "NULL";
            }
            return charset.toString();
        }
    }
{noformat}

yields:

{noformat}
java.util.concurrent.ExecutionException: java.lang.RuntimeException: detected: ISO-8859-1 but should have been: windows-1252 for C:\data\encodings\corpus\Shift_JIS\1

	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:192)
	at org.apache.tika.parser.html.HtmlParserTest.testMultiThreadingEncodingDetection(HtmlParserTest.java:1213)
{noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)