You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/10/11 13:19:03 UTC
[tika] 03/03: TIKA-2475 mods and some new tests/cleanup for
CharsetDetector. This closes #210.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 94850f2e7c7d3df6a06a924fc6d643c0f6181643
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Oct 11 09:18:47 2017 -0400
TIKA-2475 mods and some new tests/cleanup for CharsetDetector.
This closes #210.
---
CHANGES.txt | 4 ++
.../apache/tika/parser/txt/CharsetDetector.java | 32 ++++++++++------
.../tika/parser/txt/CharsetDetectorTest.java | 44 +++++++++++++++++++---
3 files changed, 63 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index b6eb6ab..d9d0eb7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
Release 1.17 - ???
+ * Fix bug in CharsetDetector that led to different detected charsets
+ depending on whether the user called setText with a byte[] or an
+ InputStream, via Sean Story (TIKA-2475).
+
* Remove JAXB for easier use with Java 9 via Robert Munteanu (TIKA-2466).
* Upgrade to POI 3.17 (TIKA-2429).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index cee7090..172cb46 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -8,6 +8,8 @@
*/
package org.apache.tika.parser.txt;
+import org.apache.poi.util.IOUtils;
+
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
@@ -190,8 +192,12 @@ public class CharsetDetector {
* @stable ICU 3.4
*/
public CharsetDetector setText(byte[] in) {
+ return setText(in, in.length);
+ }
+
+ private CharsetDetector setText(byte[] in, int length) {
fRawInput = in;
- fRawLength = in.length;
+ fRawLength = length;
MungeInput();
return this;
@@ -217,19 +223,23 @@ public class CharsetDetector {
byte[] inputBytes = new byte[kBufSize]; // Always make a new buffer because the
// previous one may have come from the caller,
// in which case we can't touch it.
- int length = 0;
- int remainingLength = kBufSize;
- while (remainingLength > 0) {
- // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
- int bytesRead = fInputStream.read(inputBytes, length, remainingLength);
- if (bytesRead <= 0) {
- break;
+ long bytesRead = -1;
+ try {
+ bytesRead = IOUtils.readFully(fInputStream, inputBytes);
+ if (bytesRead >= Integer.MAX_VALUE) {
+ throw new IOException("Can't have read > Integer.MAX_VALUE bytes");
}
- remainingLength -= bytesRead;
+ } finally {
+ fInputStream.reset();
}
- fInputStream.reset();
- return setText(inputBytes);
+ if (bytesRead < 1) {
+ return setText(new byte[0]);
+ } else if ( kBufSize > bytesRead) {
+ return setText(inputBytes, (int)bytesRead);
+ } else {
+ return setText(inputBytes);
+ }
}
/**
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index d79e0fe..cbdf251 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -22,13 +22,14 @@ import static org.junit.Assert.assertTrue;
import java.io.*;
import java.nio.file.Files;
+import org.apache.tika.TikaTest;
import org.junit.Test;
-public class CharsetDetectorTest {
+public class CharsetDetectorTest extends TikaTest {
@Test
public void testTagDropper() throws IOException {
- try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+ try (InputStream in = getResourceAsStream("/test-documents/resume.html")) {
CharsetDetector detector = new CharsetDetector();
detector.enableInputFilter(true);
detector.setText(in);
@@ -51,7 +52,7 @@ public class CharsetDetectorTest {
@Test
public void testEmptyOrNullDeclaredCharset() throws IOException {
- try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+ try (InputStream in = getResourceAsStream("/test-documents/resume.html")) {
CharsetDetector detector = new CharsetDetector();
Reader reader = detector.getReader(in, null);
assertTrue(reader.ready());
@@ -65,15 +66,15 @@ public class CharsetDetectorTest {
public void testWin125XHeuristics() throws Exception {
//TIKA-2219
CharsetDetector detector = new CharsetDetector();
- detector.setText(getClass().getResourceAsStream("/test-documents/testTXT_win-1252.txt"));
+ detector.setText(getResourceAsStream("/test-documents/testTXT_win-1252.txt"));
CharsetMatch charset = detector.detect();
assertEquals("windows-1252", charset.getName());
}
@Test
- public void testSetTextConsistency() throws IOException {
+ public void testSetTextConsistency() throws Exception {
//TIKA-2475
- File file = new File("src/test/resources/test-documents/multi-language.txt");
+ File file = getResourceAsFile("/test-documents/multi-language.txt");
byte[] fileBytes = Files.readAllBytes(file.toPath());
InputStream fileStream = new ByteArrayInputStream(fileBytes);
@@ -86,4 +87,35 @@ public class CharsetDetectorTest {
assertEquals("ISO-8859-1", fromBytesDetector.detect().getName());
assertEquals("ISO-8859-1", fromStreamDetector.detect().getName());
}
+
+ @Test
+ public void testZeroLength() throws Exception {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(new byte[0]);
+ //charset detector returns "UTF-8" when there's no data
+ assertEquals("UTF-8", detector.detect().getName());
+ }
+
+ @Test
+ public void testLengthResetCorrectly() throws IOException {
+ //test that the underlying array.length is reset correctly
+ //first fill the buffer with windows-1256
+
+ String computer = "\u0627\u0644\u062D\u0627\u0633\u0648\u0628";
+ StringBuilder sb = new StringBuilder();
+ CharsetDetector detector = new CharsetDetector();
+ for (int i = 0; i < 5000; i++) {
+ sb.append(computer);
+ }
+ detector.setText(sb.toString().getBytes("windows-1256"));
+ assertEquals("windows-1256", detector.detect().getName());
+
+ sb.setLength(0);
+ for (int i = 0; i < 5; i++) {
+ sb.append(computer);
+ }
+ //then fill a small part of the buffer with UTF-8
+ detector.setText(sb.toString().getBytes("UTF-8"));
+ assertEquals("UTF-8", detector.detect().getName());
+ }
}
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.