You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/10/11 13:19:03 UTC

[tika] 03/03: TIKA-2475 mods and some new tests/cleanup for CharsetDetector. This closes #210.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 94850f2e7c7d3df6a06a924fc6d643c0f6181643
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Oct 11 09:18:47 2017 -0400

    TIKA-2475 mods and some new tests/cleanup for CharsetDetector.
    This closes #210.
---
 CHANGES.txt                                        |  4 ++
 .../apache/tika/parser/txt/CharsetDetector.java    | 32 ++++++++++------
 .../tika/parser/txt/CharsetDetectorTest.java       | 44 +++++++++++++++++++---
 3 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b6eb6ab..d9d0eb7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 Release 1.17 - ???
 
+  * Fix bug in CharsetDetector that led to different detected charsets
+    depending on whether the user called setText with a byte[] or an
+    InputStream via Sean Story (TIKA-2475).
+
   * Remove JAXB for easier use with Java 9 via Robert Munteanu (TIKA-2466).
 
   * Upgrade to POI 3.17 (TIKA-2429).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index cee7090..172cb46 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -8,6 +8,8 @@
  */
 package org.apache.tika.parser.txt;
 
+import org.apache.poi.util.IOUtils;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
@@ -190,8 +192,12 @@ public class CharsetDetector {
      * @stable ICU 3.4
      */
     public CharsetDetector setText(byte[] in) {
+        return setText(in, in.length);
+    }
+
+    private CharsetDetector setText(byte[] in, int length) {
         fRawInput = in;
-        fRawLength = in.length;
+        fRawLength = length;
 
         MungeInput();
         return this;
@@ -217,19 +223,23 @@ public class CharsetDetector {
         byte[] inputBytes = new byte[kBufSize];   // Always make a new buffer because the
         //   previous one may have come from the caller,
         //   in which case we can't touch it.
-        int length = 0;
-        int remainingLength = kBufSize;
-        while (remainingLength > 0) {
-            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
-            int bytesRead = fInputStream.read(inputBytes, length, remainingLength);
-            if (bytesRead <= 0) {
-                break;
+        long bytesRead = -1;
+        try {
+            bytesRead = IOUtils.readFully(fInputStream, inputBytes);
+            if (bytesRead >= Integer.MAX_VALUE) {
+                throw new IOException("Can't have read > Integer.MAX_VALUE bytes");
             }
-            remainingLength -= bytesRead;
+        } finally {
+            fInputStream.reset();
         }
-        fInputStream.reset();
 
-        return setText(inputBytes);
+        if (bytesRead < 1) {
+            return setText(new byte[0]);
+        } else if ( kBufSize > bytesRead) {
+            return setText(inputBytes, (int)bytesRead);
+        } else {
+            return setText(inputBytes);
+        }
     }
 
     /**
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index d79e0fe..cbdf251 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -22,13 +22,14 @@ import static org.junit.Assert.assertTrue;
 import java.io.*;
 import java.nio.file.Files;
 
+import org.apache.tika.TikaTest;
 import org.junit.Test;
 
-public class CharsetDetectorTest {
+public class CharsetDetectorTest extends TikaTest {
 
     @Test
     public void testTagDropper() throws IOException {
-        try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+        try (InputStream in = getResourceAsStream("/test-documents/resume.html")) {
             CharsetDetector detector = new CharsetDetector();
             detector.enableInputFilter(true);
             detector.setText(in);
@@ -51,7 +52,7 @@ public class CharsetDetectorTest {
 
     @Test
     public void testEmptyOrNullDeclaredCharset() throws IOException {
-        try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+        try (InputStream in = getResourceAsStream("/test-documents/resume.html")) {
             CharsetDetector detector = new CharsetDetector();
             Reader reader = detector.getReader(in, null);
             assertTrue(reader.ready());
@@ -65,15 +66,15 @@ public class CharsetDetectorTest {
     public void testWin125XHeuristics() throws Exception {
         //TIKA-2219
         CharsetDetector detector = new CharsetDetector();
-        detector.setText(getClass().getResourceAsStream("/test-documents/testTXT_win-1252.txt"));
+        detector.setText(getResourceAsStream("/test-documents/testTXT_win-1252.txt"));
         CharsetMatch charset =  detector.detect();
         assertEquals("windows-1252", charset.getName());
     }
 
     @Test
-    public void testSetTextConsistency() throws IOException {
+    public void testSetTextConsistency() throws Exception {
         //TIKA-2475
-        File file = new File("src/test/resources/test-documents/multi-language.txt");
+        File file = getResourceAsFile("/test-documents/multi-language.txt");
         byte[] fileBytes = Files.readAllBytes(file.toPath());
         InputStream fileStream = new ByteArrayInputStream(fileBytes);
 
@@ -86,4 +87,35 @@ public class CharsetDetectorTest {
         assertEquals("ISO-8859-1", fromBytesDetector.detect().getName());
         assertEquals("ISO-8859-1", fromStreamDetector.detect().getName());
     }
+
+    @Test
+    public void testZeroLength() throws Exception {
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(new byte[0]);
+        //charset detector returns "UTF-8" when there's no data
+        assertEquals("UTF-8", detector.detect().getName());
+    }
+
+    @Test
+    public void testLengthResetCorrectly() throws IOException {
+        //test that the underlying array.length is reset correctly
+        //first fill the buffer with windows-1256
+
+        String computer = "\u0627\u0644\u062D\u0627\u0633\u0648\u0628";
+        StringBuilder sb = new StringBuilder();
+        CharsetDetector detector = new CharsetDetector();
+        for (int i = 0; i < 5000; i++) {
+            sb.append(computer);
+        }
+        detector.setText(sb.toString().getBytes("windows-1256"));
+        assertEquals("windows-1256", detector.detect().getName());
+
+        sb.setLength(0);
+        for (int i = 0; i < 5; i++) {
+            sb.append(computer);
+        }
+        //then fill a small part of the buffer with UTF-8
+        detector.setText(sb.toString().getBytes("UTF-8"));
+        assertEquals("UTF-8", detector.detect().getName());
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.