You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/21 19:51:46 UTC
[2/2] tika git commit: * Maintain passed-in mime in TXTParser (TIKA-2047).
* Maintain passed-in mime in TXTParser (TIKA-2047).
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/32d9ece8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/32d9ece8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/32d9ece8
Branch: refs/heads/2.x
Commit: 32d9ece8d84986de240087a580e094de3f879f3c
Parents: 12b1d43
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 15:51:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 15:51:02 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 ++
.../main/java/org/apache/tika/parser/txt/TXTParser.java | 12 +++++++++++-
.../java/org/apache/tika/parser/txt/TXTParserTest.java | 10 ++++++++--
3 files changed, 21 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 662217d..46a5894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
Release 1.14 - ???
+ * Maintain passed-in mime in TXTParser (TIKA-2047).
+
* Upgrade to POI 3.15-final (TIKA-2013).
* Upgrade to PDFBox 2.0.3 (TIKA-2051).
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index 2b20495..2e7bb19 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser {
try (AutoDetectReader reader = new AutoDetectReader(
new CloseShieldInputStream(stream), metadata,
context.get(ServiceLoader.class, LOADER))) {
+ //try to get detected content type; could be a subclass of text/plain
+ //such as vcal, etc.
+ String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
+ MediaType mediaType = MediaType.TEXT_PLAIN;
+ if (incomingMime != null) {
+ MediaType tmpMediaType = MediaType.parse(incomingMime);
+ if (tmpMediaType != null) {
+ mediaType = tmpMediaType;
+ }
+ }
Charset charset = reader.getCharset();
- MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+ MediaType type = new MediaType(mediaType, charset);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// deprecated, see TIKA-431
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 9d9a138..17e5ba1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -196,7 +196,7 @@ public class TXTParserTest extends TikaTest {
parser.parse(
new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
}
@@ -268,7 +268,13 @@ public class TXTParserTest extends TikaTest {
parser.parse(
new ByteArrayInputStream(text.getBytes(UTF_8)),
new BodyContentHandler(), r.metadata, new ParseContext());
- assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/binary; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
}
+ //TIKA-2047
+ @Test
+ public void testSubclassingMimeTypesRemain() throws Exception {
+ XMLResult r = getXML("testVCalendar.vcs");
+ assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+ }
}