You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/03/19 07:00:04 UTC

svn commit: r1667658 - in /tika/trunk: CHANGES.txt tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Author: mattmann
Date: Thu Mar 19 06:00:03 2015
New Revision: 1667658

URL: http://svn.apache.org/r1667658
Log:
Fix for TIKA-1365: Lower priority for XML starting with a comment; allow HTML starting with a comment to be detected as text/html. Contributed by Matthias Krueger <mk...@mkr.io>. This closes #35.

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1667658&r1=1667657&r2=1667658&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Mar 19 06:00:03 2015
@@ -1,5 +1,8 @@
 Release 1.8 - Current Development
 
+  * Tika has improved delineation in XML and HTML MIME detection
+    (TIKA-1365).
+
   * Upgraded the Drew Noakes metadata-extractor to version 2.7.2
     (TIKA-1576).
 

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1667658&r1=1667657&r2=1667658&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Mar 19 06:00:03 2015
@@ -3878,7 +3878,6 @@
     <magic priority="50">
       <match value="&lt;?xml" type="string" offset="0"/>
       <match value="&lt;?XML" type="string" offset="0"/>
-      <match value="&lt;!--" type="string" offset="0"/>
       <!-- UTF-8 BOM -->
       <match value="0xEFBBBF3C3F786D6C" type="string" offset="0"/>
       <!-- UTF-16 LE/BE -->
@@ -3886,6 +3885,12 @@
       <match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
       <!-- TODO: Add matches for the other possible XML encoding schemes -->
     </magic>
+    <!-- XML files can start with a comment, but then they cannot contain an XML declaration (it must come first).
+         This should be rare, so we assign a lower priority here. The priority is also lower than that of the
+         text/html magics, so that HTML starting with a comment is detected as text/html. -->
+    <magic priority="30">
+      <match value="&lt;!--" type="string" offset="0"/>
+    </magic>
     <glob pattern="*.xml"/>
     <glob pattern="*.xsl"/>
     <glob pattern="*.xsd"/>

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1667658&r1=1667657&r2=1667658&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu Mar 19 06:00:03 2015
@@ -561,6 +561,22 @@ public class TestMimeTypes {
     }
 
     @Test
+    public void testXmlAndHtmlDetection() throws Exception {
+        assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
+                .getBytes("UTF-8"));
+        assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+                .getBytes("UTF-16LE"));
+        assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+                .getBytes("UTF-16BE"));
+        assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
+                .getBytes("UTF-8"));
+        assertTypeByData("text/html", "<html><body>HTML</body></html>"
+                .getBytes("UTF-8"));
+        assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
+                .getBytes("UTF-8"));
+    }
+
+    @Test
     public void testWmfDetection() throws Exception {
         assertTypeByName("application/x-msmetafile", "x.wmf");
         assertTypeByData("application/x-msmetafile", "testWMF.wmf");