You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/15 15:41:55 UTC
[1/3] tika git commit: TIKA-2055 catch exception when totalTime is out
of unsigned int range in ooxml
Repository: tika
Updated Branches:
refs/heads/master 9130bbc1f -> 6ebbd4000
TIKA-2055 catch exception when totalTime is out of unsigned int range in ooxml
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/27b9cf56
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/27b9cf56
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/27b9cf56
Branch: refs/heads/master
Commit: 27b9cf566da9772961b2fac3c2aa6cc1648ab2a5
Parents: 80efc84
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 15 11:20:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 15 11:20:29 2016 -0400
----------------------------------------------------------------------
.../parser/microsoft/ooxml/MetadataExtractor.java | 15 +++++++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 7 +++++++
.../testWORD_totalTimeOutOfRange.docx | Bin 0 -> 11047 bytes
3 files changed, 20 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/27b9cf56/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 25d3596..91d49c7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
@@ -113,6 +114,16 @@ public class MetadataExtractor {
Metadata metadata) {
CTProperties propsHolder = properties.getUnderlyingProperties();
+ //TIKA-2055: some ooxml files can include unsigned int/long values
+ //that make getTotalTime() throw XmlValueOutOfRangeException.
+ //For now, catch it and record the value as '0' because
+ //Word converts it to '0' on save.
+ int totalTime = 0;
+ try {
+ totalTime = propsHolder.getTotalTime();
+ } catch (XmlValueOutOfRangeException e) {
+ //swallow for now
+ }
addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication());
addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
@@ -121,7 +132,7 @@ public class MetadataExtractor {
addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
- addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime());
+ addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
if (propsHolder.getPages() > 0) {
metadata.set(PagedText.N_PAGES, propsHolder.getPages());
@@ -146,7 +157,7 @@ public class MetadataExtractor {
addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
- addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
+ addProperty(metadata, Metadata.TOTAL_TIME, totalTime);
addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
http://git-wip-us.apache.org/repos/asf/tika/blob/27b9cf56/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3e984de..ac62b03 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1256,6 +1256,13 @@ public class OOXMLParserTest extends TikaTest {
assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
}
+
+ @Test
+ public void testLongForIntExceptionInSummaryDetails() throws Exception {
+ //TIKA-2055
+ assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/27b9cf56/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx b/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx
new file mode 100644
index 0000000..2a5c353
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_totalTimeOutOfRange.docx differ
[2/3] tika git commit: Merge remote-tracking branch 'origin/master'
Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1c0e6003
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1c0e6003
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1c0e6003
Branch: refs/heads/master
Commit: 1c0e60035d2bfa4857c0e48742754aa904727520
Parents: 27b9cf5 9130bbc
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 15 11:21:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 15 11:21:02 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 1 +
.../org/apache/tika/mime/tika-mimetypes.xml | 52 +++++++++++++++++++
.../org/apache/tika/mime/TestMimeTypes.java | 10 ++++
.../resources/test-documents/testStataDTA.dta | Bin 0 -> 1207 bytes
.../resources/test-documents/testStataDTA.txt | 15 ++++++
5 files changed, 78 insertions(+)
----------------------------------------------------------------------
[3/3] tika git commit: clean up triplicate commons-exec definitions;
not sure how these got in here.
Posted by ta...@apache.org.
Clean up triplicate commons-exec definitions; not sure how these got in here.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6ebbd400
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6ebbd400
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6ebbd400
Branch: refs/heads/master
Commit: 6ebbd4000b84484e3b729dbb789f6c5cd190b5dd
Parents: 1c0e600
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 15 11:33:36 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 15 11:33:36 2016 -0400
----------------------------------------------------------------------
tika-parsers/pom.xml | 13 -------------
1 file changed, 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/6ebbd400/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 58dcbf3..e885498 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -266,13 +266,6 @@
<scope>compile</scope>
</dependency>
- <!-- TIKA-2021: Tesseract OCR Parser tests -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- <version>3.4</version>
- </dependency>
-
<!-- Provided dependencies -->
<dependency>
<groupId>org.xerial</groupId>
@@ -294,12 +287,6 @@
</dependency>
<dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-exec</artifactId>
- <version>1.3</version>
- </dependency>
-
- <dependency>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
<version>1.1.1</version>