You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/21 19:51:45 UTC

[1/2] tika git commit: TIKA-2013 -- upgrade to POI 3.15 -- don't forget to close new NPOIFS and MAPIMessage

Repository: tika
Updated Branches:
  refs/heads/2.x 1b32e3186 -> 32d9ece8d


TIKA-2013 -- upgrade to POI 3.15 -- don't forget to close new NPOIFS and MAPIMessage


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/12b1d435
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/12b1d435
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/12b1d435

Branch: refs/heads/2.x
Commit: 12b1d435bbdc5df9d5e396285c83ddeda44240ae
Parents: 1b32e31
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 14:23:00 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 14:23:00 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |  4 +--
 tika-bundle/pom.xml                             |  4 +--
 .../tika-parser-office-bundle/pom.xml           |  1 +
 tika-parser-modules/pom.xml                     |  2 +-
 .../parser/microsoft/JackcessExtractor.java     |  5 +--
 .../tika/parser/microsoft/OfficeParser.java     | 38 ++++++++++++--------
 .../tika/parser/microsoft/OutlookExtractor.java | 12 +++++--
 7 files changed, 42 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index baee8b4..662217d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Upgrade to POI 3.15-final (TIKA-2013).
+
   * Upgrade to PDFBox 2.0.3 (TIKA-2051).
 
   * Prevent OOM/permanent hang on some corrupt CHM files (TIKA-2040).
@@ -45,8 +47,6 @@ Release 1.14 - ???
      * iCal and vCalendar (TIKA-2006)
      * MBOX (TIKA-2042)
 
-  * Upgrade to PDFBox 2.0.2 (TIKA-1996).
-
   * Add configurable maximum threshold for number of events extracted
     from the XMP Media Management Schema in JempboxExtractor (TIKA-1999).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 3b7a6ce..e8f3e83 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -126,7 +126,7 @@
             <Embed-Dependency>
               tika-parsers;inline=true,
               commons-compress, xz, commons-codec, commons-csv,
-              commons-io, commons-exec, junrar,
+              commons-io, commons-exec, commons-collections4, junrar,
               pdfbox,pdfbox-tools,pdfbox-debugger,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
               poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
               curvesapi,
@@ -444,4 +444,4 @@
     <system>Jenkins</system>
     <url>https://builds.apache.org/job/Tika-trunk/</url>
   </ciManagement>
-</project>
+</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-bundles/tika-parser-office-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
index f6b2169..1529c97 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
@@ -65,6 +65,7 @@
               commons-lang;inline=true,
               commons-io;inline=true,
               commons-codec;inline=true,
+              commons-collections4;inline=true,
               poi;inline=true,
               poi-scratchpad;inline=true,
               poi-ooxml;inline=true,

http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index ef92a7c..dc3b409 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
   
   <properties>
-    <poi.version>3.15-beta1</poi.version>
+    <poi.version>3.15</poi.version>
     <!-- NOTE: sync codec version with POI -->
     <codec.version>1.10</codec.version>
     <pdfbox.version>2.0.3</pdfbox.version>

http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index fb8a2c2..4f26ff0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -328,8 +328,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
     }
 
     private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
-        NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream());
-        handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+        try (NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream())) {
+            handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+        }
     }
 
     String formatCurrency(Double d, DataType type) {

http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index f5f9f3e..b6681aa 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -95,26 +96,33 @@ public class OfficeParser extends AbstractParser {
 
         final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
-        if (tstream == null) {
-            root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
-        } else {
-            final Object container = tstream.getOpenContainer();
-            if (container instanceof NPOIFSFileSystem) {
-                root = ((NPOIFSFileSystem) container).getRoot();
-            } else if (container instanceof DirectoryNode) {
-                root = (DirectoryNode) container;
+        NPOIFSFileSystem mustCloseFs = null;
+        try {
+            if (tstream == null) {
+                mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+                root = mustCloseFs.getRoot();
             } else {
-                NPOIFSFileSystem fs;
-                if (tstream.hasFile()) {
-                    fs = new NPOIFSFileSystem(tstream.getFile(), true);
+                final Object container = tstream.getOpenContainer();
+                if (container instanceof NPOIFSFileSystem) {
+                    root = ((NPOIFSFileSystem) container).getRoot();
+                } else if (container instanceof DirectoryNode) {
+                    root = (DirectoryNode) container;
                 } else {
-                    fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+                    NPOIFSFileSystem fs = null;
+                    if (tstream.hasFile()) {
+                        fs = new NPOIFSFileSystem(tstream.getFile(), true);
+                    } else {
+                        fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+                    }
+                    //tstream will close the fs, no need to close this below
+                    tstream.setOpenContainer(fs);
+                    root = fs.getRoot();
                 }
-                tstream.setOpenContainer(fs);
-                root = fs.getRoot();
             }
+            parse(root, context, metadata, xhtml);
+        } finally {
+            IOUtils.closeQuietly(mustCloseFs);
         }
-        parse(root, context, metadata, xhtml);
         xhtml.endDocument();
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a922c5d..74a95e7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
@@ -62,8 +64,6 @@ import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * Outlook Message Parser.
  */
@@ -260,6 +260,14 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
             }
         } catch (ChunkNotFoundException e) {
             throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+        } finally {
+            if (msg != null) {
+                try {
+                    msg.close();
+                } catch (IOException e) {
+                    //swallow
+                }
+            }
         }
     }
 


[2/2] tika git commit: * Maintain passed-in mime in TXTParser (TIKA-2047).

Posted by ta...@apache.org.
 * Maintain passed-in mime in TXTParser (TIKA-2047).


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/32d9ece8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/32d9ece8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/32d9ece8

Branch: refs/heads/2.x
Commit: 32d9ece8d84986de240087a580e094de3f879f3c
Parents: 12b1d43
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 15:51:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 15:51:02 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                             |  2 ++
 .../main/java/org/apache/tika/parser/txt/TXTParser.java | 12 +++++++++++-
 .../java/org/apache/tika/parser/txt/TXTParserTest.java  | 10 ++++++++--
 3 files changed, 21 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 662217d..46a5894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Maintain passed-in mime in TXTParser (TIKA-2047).
+
   * Upgrade to POI 3.15-final (TIKA-2013).
 
   * Upgrade to PDFBox 2.0.3 (TIKA-2051).

http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index 2b20495..2e7bb19 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser {
         try (AutoDetectReader reader = new AutoDetectReader(
                 new CloseShieldInputStream(stream), metadata,
                 context.get(ServiceLoader.class, LOADER))) {
+            //try to get detected content type; could be a subclass of text/plain
+            //such as vcal, etc.
+            String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType mediaType = MediaType.TEXT_PLAIN;
+            if (incomingMime != null) {
+                MediaType tmpMediaType = MediaType.parse(incomingMime);
+                if (tmpMediaType != null) {
+                    mediaType = tmpMediaType;
+                }
+            }
             Charset charset = reader.getCharset();
-            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+            MediaType type = new MediaType(mediaType, charset);
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
             // deprecated, see TIKA-431
             metadata.set(Metadata.CONTENT_ENCODING, charset.name());

http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 9d9a138..17e5ba1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -196,7 +196,7 @@ public class TXTParserTest extends TikaTest {
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
                 new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
     }
 
@@ -268,7 +268,13 @@ public class TXTParserTest extends TikaTest {
         parser.parse(
                 new ByteArrayInputStream(text.getBytes(UTF_8)),
                 new BodyContentHandler(), r.metadata, new ParseContext());
-        assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/binary; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
     }
 
+    //TIKA-2047
+    @Test
+    public void testSubclassingMimeTypesRemain() throws Exception {
+        XMLResult r = getXML("testVCalendar.vcs");
+        assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
 }