You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/01 19:22:07 UTC

[tika] branch 2.x updated (4e1e87f -> 62e5a84)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git.

      from  4e1e87f   TIKA-2348 -- include caught exception in EMF/WMF rethrows
       new  6930ff0   TIKA-2311 -- try OPC before ZipFile.  This can work better on some truncated files.
       new  62e5a84   TIKA-2350

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 12 ++++++++---
 .../org/apache/tika/parser/opc/OPCDetector.java    |  3 +--
 .../tika/parser/pkg/ZipContainerDetector.java      | 25 +++++++++++++---------
 3 files changed, 25 insertions(+), 15 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 01/02: TIKA-2311 -- try OPC before ZipFile. This can work better on some truncated files.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6930ff0251e9e93ee969a9f1287c902d31045b59
Author: tballison <ta...@mitre.org>
AuthorDate: Mon May 1 15:20:30 2017 -0400

    TIKA-2311 -- try OPC before ZipFile.  This can work better on some truncated files.
---
 .../org/apache/tika/parser/opc/OPCDetector.java    |  3 +--
 .../tika/parser/pkg/ZipContainerDetector.java      | 25 +++++++++++++---------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
index a8fe200..21032d1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
@@ -76,8 +76,7 @@ public class OPCDetector implements Detector {
             
             return type;
         } catch (InvalidFormatException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
+            //swallow
         }finally {
             tmp.close();
         }
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 0a12e15..1980bd6 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -131,13 +131,20 @@ public class ZipContainerDetector extends AbstractDetector {
     }
 
     private MediaType detectZipFormat(TikaInputStream tis) {
+
+        //try opc first because opening a package
+        //will not necessarily throw an exception for
+        //truncated files.
+        MediaType type = detectOPCBased(tis);
+        if (type != null) {
+            return type;
+        }
+
         try {
             ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
             try {
-                MediaType type = detectOpenDocument(zip);
-                if (type == null) {
-                    type = detectOPCBased(zip, tis);
-                }
+                type = detectOpenDocument(zip);
+
                 if (type == null) {
                     type = detectIWork(zip);
                 }
@@ -191,18 +198,16 @@ public class ZipContainerDetector extends AbstractDetector {
         }
     }
 
-    private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+    private MediaType detectOPCBased(TikaInputStream stream) {
         try {
-            if (zip.getEntry("_rels/.rels") != null
-                    || zip.getEntry("[Content_Types].xml") != null) {
+//            if (zip.getEntry("_rels/.rels") != null
+  //                  || zip.getEntry("[Content_Types].xml") != null) {
                 MediaType type = this.opcDetector.detect(stream, null);
                 if (type != null) return type;
                 
                 // We don't know what it is, sorry
                 return null;
-            } else {
-                return null;
-            }
+
         } catch (IOException e) {
             return null;
         } catch (RuntimeException e) {

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 02/02: TIKA-2350

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 62e5a8477b5cd8a4354c152bffe237a00051b7a4
Author: tballison <ta...@mitre.org>
AuthorDate: Mon May 1 15:21:58 2017 -0400

    TIKA-2350
---
 .../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java   | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 44950aa..0f91073 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -16,8 +16,6 @@
  */
 package org.apache.tika.parser.pdf;
 
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
-
 import javax.xml.stream.XMLStreamException;
 import java.awt.image.BufferedImage;
 import java.io.BufferedInputStream;
@@ -95,6 +93,8 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
 class AbstractPDF2XHTML extends PDFTextStripper {
 
     enum ActionTrigger {
@@ -456,7 +456,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     protected void startDocument(PDDocument pdf) throws IOException {
         try {
             xhtml.startDocument();
-            handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
+            try
+            {
+                handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
+            } catch (IOException e) {
+                //See PDFBOX-3773
+                //swallow -- no need to report this
+            }
         } catch (TikaException|SAXException e) {
             throw new IOExceptionWithCause("Unable to start a document", e);
         }

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.