You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2012/08/13 19:53:38 UTC
svn commit: r1372530 -
/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Author: kkrugler
Date: Mon Aug 13 17:53:38 2012
New Revision: 1372530
URL: http://svn.apache.org/viewvc?rev=1372530&view=rev
Log:
TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
Added test to confirm that it was fixed by Jukka's previous changes to
the charset detection & CONTENT_TYPE handling code.
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1372530&r1=1372529&r2=1372530&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Mon Aug 13 17:53:38 2012
@@ -252,5 +252,28 @@ public class TXTParserTest extends TestC
assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
+
+ /**
+ * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+ * <p>
+ * Parses the UTF-8 bytes of the short, ASCII-only string "Hello, World!" twice,
+ * reusing the same {@code Metadata} instance for both passes:
+ * <ol>
+ * <li>With no content type set beforehand, the test expects
+ * {@code Metadata.CONTENT_TYPE} to come back as
+ * {@code text/plain; charset=ISO-8859-1}.</li>
+ * <li>With {@code Metadata.CONTENT_TYPE} pre-set to
+ * {@code application/binary; charset=UTF-8}, the test expects
+ * {@code text/plain; charset=UTF-8} -- i.e. the charset from the hint is
+ * kept, while the reported media type is still {@code text/plain} rather
+ * than the hinted {@code application/binary}.</li>
+ * </ol>
+ * The extracted text itself is not inspected; each pass hands the parser a
+ * fresh {@code BodyContentHandler} that is discarded afterwards.
+ *
+ * @throws Exception if either parse call fails
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+ */
+ public void testCharsetDetectionWithShortSnipet() throws Exception {
+ final String text = "Hello, World!";
+
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+ // Now verify that if we tell the parser the encoding is UTF-8 (through the
+ // incoming CONTENT_TYPE metadata), that's what we get back (see TIKA-868)
+ metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+ }
}