You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "Tim Allison (JIRA)" <ji...@apache.org> on 2016/07/26 02:52:21 UTC
[jira] [Created] (TIKA-2041) Charset detection doesn't appear to be
thread-safe
Tim Allison created TIKA-2041:
---------------------------------
Summary: Charset detection doesn't appear to be thread-safe
Key: TIKA-2041
URL: https://issues.apache.org/jira/browse/TIKA-2041
Project: Tika
Issue Type: Bug
Reporter: Tim Allison
On the user list, Christian Leitinger noted that his team found a potential issue with the thread safety of the encoding detector. I was able to reproduce this on the corpus of HTML files in [~faghani]'s encoding detector, using the following test:
{noformat}
/**
 * Checks that charset detection gives the same answer under heavy
 * concurrency as it does when run on a single thread.
 *
 * <p>A baseline encoding is first computed sequentially for every file in the
 * corpus. Then {@code numThreads} runners are submitted, each of which
 * re-detects randomly chosen files and throws if a result differs from the
 * baseline. A runner failure surfaces here as an
 * {@link java.util.concurrent.ExecutionException} from {@code future.get()}.
 *
 * @throws Exception if the baseline cannot be computed or any runner fails
 */
@Test
public void testMultiThreadingEncodingDetection() throws Exception {
    Path testDocs = Paths.get("C:/data/encodings/corpus");
    List<Path> paths = new ArrayList<>();
    Map<Path, String> encodings = new ConcurrentHashMap<>();

    // Single-threaded pass: record the encoding detected for each file as the
    // expected value for the concurrent pass below.
    for (File encodingDirs : testDocs.toFile().listFiles()) {
        for (File file : encodingDirs.listFiles()) {
            String encoding = getEncoding(file.toPath());
            paths.add(file.toPath());
            encodings.put(file.toPath(), encoding);
        }
    }

    int numThreads = 1000;
    ExecutorService ex = Executors.newFixedThreadPool(numThreads);
    try {
        CompletionService<String> completionService =
                new ExecutorCompletionService<>(ex);
        for (int i = 0; i < numThreads; i++) {
            completionService.submit(new EncodingDetectorRunner(paths, encodings), "done");
        }

        int completed = 0;
        while (completed < numThreads) {
            // take() blocks until a runner has finished; get() rethrows the
            // runner's exception (if any) wrapped in an ExecutionException.
            Future<String> future = completionService.take();
            if (future.isDone() && "done".equals(future.get())) {
                completed++;
            }
        }
    } finally {
        // Always release the pool's threads. On failure this also interrupts
        // the runners that are still going, instead of leaking them.
        ex.shutdownNow();
    }
    assertTrue("success!", true);
}
/**
 * Worker that repeatedly re-detects the encoding of randomly selected files
 * and compares each result with the previously recorded expected encoding.
 * Any mismatch, or any failure during detection, is raised as a
 * {@link RuntimeException}.
 */
private class EncodingDetectorRunner implements Runnable {
    private final List<Path> paths;
    private final Map<Path, String> encodings;
    private final Random r = new Random();

    private EncodingDetectorRunner(List<Path> paths, Map<Path, String> encodings) {
        this.paths = paths;
        this.encodings = encodings;
    }

    /** Runs 100 random detections, throwing on the first mismatch. */
    @Override
    public void run() {
        int remaining = 100;
        while (remaining-- > 0) {
            Path candidate = paths.get(r.nextInt(paths.size()));
            String detected = detect(candidate);
            String expected = encodings.get(candidate);
            if (detected.equals(expected)) {
                continue;
            }
            throw new RuntimeException("detected: " + detected +
                    " but should have been: " + expected + " for " + candidate);
        }
    }

    /** Delegates to the enclosing test's getEncoding, rethrowing any failure unchecked. */
    private String detect(Path candidate) {
        try {
            return getEncoding(candidate);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}
/**
 * Returns the name of the charset that {@code AutoDetectReader} detects for
 * the given file.
 *
 * @param p the file to inspect
 * @return the detected charset's string form, or {@code "NULL"} if the reader
 *         reports no charset
 * @throws Exception if the file cannot be opened or detection fails
 */
public String getEncoding(Path p) throws Exception {
    try (InputStream is = TikaInputStream.get(p)) {
        AutoDetectReader reader = new AutoDetectReader(is);
        // Null-check the charset itself rather than the result of toString():
        // a null charset would otherwise throw a NullPointerException before
        // the "NULL" fallback could be reached.
        // NOTE(review): whether getCharset() can actually return null is not
        // visible here -- confirm against AutoDetectReader.
        java.nio.charset.Charset charset = reader.getCharset();
        if (charset == null) {
            return "NULL";
        }
        String val = charset.toString();
        return val == null ? "NULL" : val;
    }
}
{noformat}
yields:
{noformat}
java.util.concurrent.ExecutionException: java.lang.RuntimeException: detected: ISO-8859-1 but should have been: windows-1252 for C:\data\encodings\corpus\Shift_JIS\1
at java.util.concurrent.FutureTask.report(FutureTask.java:122)
at java.util.concurrent.FutureTask.get(FutureTask.java:192)
at org.apache.tika.parser.html.HtmlParserTest.testMultiThreadingEncodingDetection(HtmlParserTest.java:1213)
{noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)