You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/09/03 13:11:50 UTC

svn commit: r1700986 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/detect/

Author: nick
Date: Thu Sep  3 11:11:50 2015
New Revision: 1700986

URL: http://svn.apache.org/r1700986
Log:
TIKA-1728 HWP v5(+?) detection

Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1700986&r1=1700985&r2=1700986&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Sep  3 11:11:50 2015
@@ -3230,14 +3230,20 @@
   </mime-type>
 
   <mime-type type="application/x-hwp">
+    <_comment>Hangul Word Processor File</_comment>
     <magic priority="50">
       <!--
         TIKA-330: Detection pattern based on signature strings from
         the hwpfilter/source/hwpfile.cpp file in OpenOffice.org.
+        This pattern only matches HWP before v5; v5 onwards is OLE2-based.
       -->
       <match value="HWP Document File V" type="string" offset="0"/>
     </magic>
   </mime-type>
+  <mime-type type="application/x-hwp-v5">
+    <_comment>Hangul Word Processor File v5</_comment>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
 
   <mime-type type="application/x-ibooks+zip">
     <sub-class-of type="application/epub+zip" />

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1700986&r1=1700985&r2=1700986&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Thu Sep  3 11:11:50 2015
@@ -125,6 +125,10 @@ public class POIFSContainerDetector impl
      */
     public static final MediaType SLDWORKS = application("sldworks");
     /**
+     * Hangul Word Processor (Korean), v5 and later (OLE2-based)
+     */
+    public static final MediaType HWP = application("x-hwp-v5");
+    /**
      * Serial version UID
      */
     private static final long serialVersionUID = -3028021741663605293L;
@@ -196,6 +200,9 @@ public class POIFSContainerDetector impl
                 } else {
                     return processCompObjFormatType(root);
                 }
+            } else if (names.contains("\u0005HwpSummaryInformation")) {
+                // Hangul Word Processor v5+ (earlier versions aren't OLE2-based)
+                return HWP;
             } else if (names.contains("WksSSWorkBook")) {
                 // This check has to be before names.contains("Workbook")
                 // Works 7.0 spreadsheet files contain both

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1700986&r1=1700985&r2=1700986&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Thu Sep  3 11:11:50 2015
@@ -106,6 +106,8 @@ public class TestContainerAwareDetector
         assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
         assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");
         
+        assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5");
+        
         
         // With the filename and data
         assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");