You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/21 19:51:46 UTC

[2/2] tika git commit: * Maintain passed-in mime in TXTParser (TIKA-2047).

 * Maintain the passed-in MIME type in TXTParser instead of always resetting it to text/plain (TIKA-2047).


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/32d9ece8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/32d9ece8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/32d9ece8

Branch: refs/heads/2.x
Commit: 32d9ece8d84986de240087a580e094de3f879f3c
Parents: 12b1d43
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 15:51:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 15:51:02 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                             |  2 ++
 .../main/java/org/apache/tika/parser/txt/TXTParser.java | 12 +++++++++++-
 .../java/org/apache/tika/parser/txt/TXTParserTest.java  | 10 ++++++++--
 3 files changed, 21 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 662217d..46a5894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Maintain passed-in mime in TXTParser (TIKA-2047).
+
   * Upgrade to POI 3.15-final (TIKA-2013).
 
   * Upgrade to PDFBox 2.0.3 (TIKA-2051).

http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index 2b20495..2e7bb19 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser {
         try (AutoDetectReader reader = new AutoDetectReader(
                 new CloseShieldInputStream(stream), metadata,
                 context.get(ServiceLoader.class, LOADER))) {
+            //try to get detected content type; could be a subclass of text/plain
+            //such as vcal, etc.
+            String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType mediaType = MediaType.TEXT_PLAIN;
+            if (incomingMime != null) {
+                MediaType tmpMediaType = MediaType.parse(incomingMime);
+                if (tmpMediaType != null) {
+                    mediaType = tmpMediaType;
+                }
+            }
             Charset charset = reader.getCharset();
-            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+            MediaType type = new MediaType(mediaType, charset);
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
             // deprecated, see TIKA-431
             metadata.set(Metadata.CONTENT_ENCODING, charset.name());

http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 9d9a138..17e5ba1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -196,7 +196,7 @@ public class TXTParserTest extends TikaTest {
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
                 new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
     }
 
@@ -268,7 +268,13 @@ public class TXTParserTest extends TikaTest {
         parser.parse(
                 new ByteArrayInputStream(text.getBytes(UTF_8)),
                 new BodyContentHandler(), r.metadata, new ParseContext());
-        assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/binary; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
     }
 
+    //TIKA-2047
+    @Test
+    public void testSubclassingMimeTypesRemain() throws Exception {
+        XMLResult r = getXML("testVCalendar.vcs");
+        assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
 }