You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/20 15:32:55 UTC

svn commit: r1328370 - in /tika/trunk/tika-core/src: main/resources/org/apache/tika/mime/tika-mimetypes.xml test/java/org/apache/tika/mime/MimeDetectionTest.java test/resources/org/apache/tika/mime/test-utf8-bom.xml

Author: nick
Date: Fri Apr 20 13:32:55 2012
New Revision: 1328370

URL: http://svn.apache.org/viewvc?rev=1328370&view=rev
Log:
TIKA-897 Detect XML files that start with the UTF-8 BOM, plus test

Added:
    tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml   (with props)
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1328370&r1=1328369&r2=1328370&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Apr 20 13:32:55 2012
@@ -2940,6 +2940,9 @@
       <match value="&lt;?xml" type="string" offset="0"/>
       <match value="&lt;?XML" type="string" offset="0"/>
       <match value="&lt;!--" type="string" offset="0"/>
+      <!-- UTF-8 BOM (EF BB BF) immediately followed by "<?xml" -->
+      <match value="0xEFBBBF3C3F786D6C" type="string" offset="0"/>
+      <!-- UTF-16 BOM (FF FE = little-endian, FE FF = big-endian) followed by "<?xml" in that byte order -->
       <match value="0xFFFE3C003F0078006D006C00" type="string" offset="0"/>
       <match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
       <!-- TODO: Add matches for the other possible XML encoding schemes -->

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1328370&r1=1328369&r2=1328370&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Fri Apr 20 13:32:55 2012
@@ -46,6 +46,7 @@ public class MimeDetectionTest extends T
         testFile("text/html", "test.html");
         testFile("application/xml", "test-iso-8859-1.xml");
         testFile("application/xml", "test-utf8.xml");
+        testFile("application/xml", "test-utf8-bom.xml");
         testFile("application/xml", "test-utf16le.xml");
         testFile("application/xml", "test-utf16be.xml");
         testFile("application/xml", "test-long-comment.xml");

Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml?rev=1328370&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml (added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml Fri Apr 20 13:32:55 2012
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file

Propchange: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/xml