You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/09/03 13:11:50 UTC
svn commit: r1700986 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/detect/
Author: nick
Date: Thu Sep 3 11:11:50 2015
New Revision: 1700986
URL: http://svn.apache.org/r1700986
Log:
TIKA-1728 HWP v5(+?) detection
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1700986&r1=1700985&r2=1700986&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Sep 3 11:11:50 2015
@@ -3230,14 +3230,20 @@
</mime-type>
<mime-type type="application/x-hwp">
+ <_comment>Hangul Word Processor File</_comment>
<magic priority="50">
<!--
TIKA-330: Detection pattern based on signature strings from
the hwpfilter/source/hwpfile.cpp file in OpenOffice.org.
+ This is for HWP before v5; v5 onwards uses OLE2
-->
<match value="HWP Document File V" type="string" offset="0"/>
</magic>
</mime-type>
+ <mime-type type="application/x-hwp-v5">
+ <_comment>Hangul Word Processor File v5</_comment>
+ <sub-class-of type="application/x-tika-msoffice"/>
+ </mime-type>
<mime-type type="application/x-ibooks+zip">
<sub-class-of type="application/epub+zip" />
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1700986&r1=1700985&r2=1700986&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Thu Sep 3 11:11:50 2015
@@ -125,6 +125,10 @@ public class POIFSContainerDetector impl
*/
public static final MediaType SLDWORKS = application("sldworks");
/**
+ * Hangul Word Processor (Korean)
+ */
+ public static final MediaType HWP = application("x-hwp-v5");
+ /**
* Serial version UID
*/
private static final long serialVersionUID = -3028021741663605293L;
@@ -196,6 +200,9 @@ public class POIFSContainerDetector impl
} else {
return processCompObjFormatType(root);
}
+ } else if (names.contains("\u0005HwpSummaryInformation")) {
+ // Hangul Word Processor v5+ (previous aren't OLE2-based)
+ return HWP;
} else if (names.contains("WksSSWorkBook")) {
// This check has to be before names.contains("Workbook")
// Works 7.0 spreadsheet files contain both
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1700986&r1=1700985&r2=1700986&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Thu Sep 3 11:11:50 2015
@@ -106,6 +106,8 @@ public class TestContainerAwareDetector
assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");
+ assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5");
+
// With the filename and data
assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");