You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/15 15:41:55 UTC

[1/3] tika git commit: TIKA-2055 catch exception when totalTime out of unsigned int range in ooxml

Repository: tika
Updated Branches:
  refs/heads/master 9130bbc1f -> 6ebbd4000


TIKA-2055 catch exception when totalTime out of unsigned int range in ooxml


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/27b9cf56
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/27b9cf56
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/27b9cf56

Branch: refs/heads/master
Commit: 27b9cf566da9772961b2fac3c2aa6cc1648ab2a5
Parents: 80efc84
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 15 11:20:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 15 11:20:29 2016 -0400

----------------------------------------------------------------------
 .../parser/microsoft/ooxml/MetadataExtractor.java  |  15 +++++++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |   7 +++++++
 .../testWORD_totalTimeOutOfRange.docx              | Bin 0 -> 11047 bytes
 3 files changed, 20 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/27b9cf56/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 25d3596..91d49c7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
 
@@ -113,6 +114,16 @@ public class MetadataExtractor {
                                  Metadata metadata) {
         CTProperties propsHolder = properties.getUnderlyingProperties();
 
+        //TIKA-2055: some ooxml files contain a totalTime value that is out of
+        //range for an int, which causes XmlValueOutOfRangeException.
+        //For now, catch it and record the value as '0' because
+        //Word converts it to '0' on save.
+        int totalTime = 0;
+        try {
+            totalTime = propsHolder.getTotalTime();
+        } catch (XmlValueOutOfRangeException e) {
+            //intentionally ignored for now; totalTime keeps its default of 0
+        }
         addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication());
         addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
         addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
@@ -121,7 +132,7 @@ public class MetadataExtractor {
         addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
         addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
         addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
-        addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime());
+        addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
 
         if (propsHolder.getPages() > 0) {
             metadata.set(PagedText.N_PAGES, propsHolder.getPages());
@@ -146,7 +157,7 @@ public class MetadataExtractor {
         addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
         addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
         addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
-        addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
+        addProperty(metadata, Metadata.TOTAL_TIME, totalTime);
         addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
         addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
         addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());

http://git-wip-us.apache.org/repos/asf/tika/blob/27b9cf56/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3e984de..ac62b03 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1256,6 +1256,13 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
         assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
     }
+
+    @Test
+    public void testLongForIntExceptionInSummaryDetails() throws Exception {
+        //TIKA-2055: parsing must still succeed when totalTime is out of range
+        assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml);
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/27b9cf56/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx b/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx
new file mode 100644
index 0000000..2a5c353
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx differ


[2/3] tika git commit: Merge remote-tracking branch 'origin/master'

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1c0e6003
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1c0e6003
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1c0e6003

Branch: refs/heads/master
Commit: 1c0e60035d2bfa4857c0e48742754aa904727520
Parents: 27b9cf5 9130bbc
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 15 11:21:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 15 11:21:02 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   1 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |  52 +++++++++++++++++++
 .../org/apache/tika/mime/TestMimeTypes.java     |  10 ++++
 .../resources/test-documents/testStataDTA.dta   | Bin 0 -> 1207 bytes
 .../resources/test-documents/testStataDTA.txt   |  15 ++++++
 5 files changed, 78 insertions(+)
----------------------------------------------------------------------



[3/3] tika git commit: clean up triplicate commons-exec defs...not sure how these got in here.

Posted by ta...@apache.org.
clean up triplicate commons-exec defs...not sure how these got in here.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6ebbd400
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6ebbd400
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6ebbd400

Branch: refs/heads/master
Commit: 6ebbd4000b84484e3b729dbb789f6c5cd190b5dd
Parents: 1c0e600
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 15 11:33:36 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 15 11:33:36 2016 -0400

----------------------------------------------------------------------
 tika-parsers/pom.xml | 13 -------------
 1 file changed, 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/6ebbd400/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 58dcbf3..e885498 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -266,13 +266,6 @@
 	    <scope>compile</scope>
 	</dependency>
 
-	<!-- TIKA-2021: Tesseract OCR Parser tests -->
-	<dependency>
-	    <groupId>org.apache.commons</groupId>
-	    <artifactId>commons-lang3</artifactId>
-	    <version>3.4</version>
-	</dependency>
-
     <!-- Provided dependencies -->
     <dependency>
       <groupId>org.xerial</groupId>
@@ -294,12 +287,6 @@
     </dependency>
 
     <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-exec</artifactId>
-      <version>1.3</version>
-    </dependency>
-
-    <dependency>
       <groupId>com.googlecode.json-simple</groupId>
       <artifactId>json-simple</artifactId>
       <version>1.1.1</version>