You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/03/19 07:00:04 UTC
svn commit: r1667658 - in /tika/trunk: CHANGES.txt
tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Author: mattmann
Date: Thu Mar 19 06:00:03 2015
New Revision: 1667658
URL: http://svn.apache.org/r1667658
Log:
Fix for TIKA-1365: lower the priority for XML starting with a comment, and allow HTML starting with a comment to be detected as text/html. Contributed by Matthias Krueger <mk...@mkr.io>. This closes #35.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1667658&r1=1667657&r2=1667658&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Mar 19 06:00:03 2015
@@ -1,5 +1,8 @@
Release 1.8 - Current Development
+ * Tika has improved delineation in XML and HTML MIME detection
+ (TIKA-1365).
+
* Upgraded the Drew Noakes metadata-extractor to version 2.7.2
(TIKA-1576).
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1667658&r1=1667657&r2=1667658&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Mar 19 06:00:03 2015
@@ -3878,7 +3878,6 @@
<magic priority="50">
<match value="<?xml" type="string" offset="0"/>
<match value="<?XML" type="string" offset="0"/>
- <match value="<!--" type="string" offset="0"/>
<!-- UTF-8 BOM -->
<match value="0xEFBBBF3C3F786D6C" type="string" offset="0"/>
<!-- UTF-16 LE/BE -->
@@ -3886,6 +3885,12 @@
<match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
<!-- TODO: Add matches for the other possible XML encoding schemes -->
</magic>
+ <!-- XML files can start with a comment, but then they must not contain an XML declaration
+ (the "<?xml" declaration is only allowed at the very start). This should be rare, so we assign a lower
+ priority here. The priority is also lower than that of the text/html magics, so that those are preferred for HTML starting with a comment. -->
+ <magic priority="30">
+ <match value="<!--" type="string" offset="0"/>
+ </magic>
<glob pattern="*.xml"/>
<glob pattern="*.xsl"/>
<glob pattern="*.xsd"/>
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1667658&r1=1667657&r2=1667658&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu Mar 19 06:00:03 2015
@@ -561,6 +561,22 @@ public class TestMimeTypes {
}
@Test
+ public void testXmlAndHtmlDetection() throws Exception {
+ assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
+ .getBytes("UTF-8"));
+ assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+ .getBytes("UTF-16LE"));
+ assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+ .getBytes("UTF-16BE"));
+ assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
+ .getBytes("UTF-8"));
+ assertTypeByData("text/html", "<html><body>HTML</body></html>"
+ .getBytes("UTF-8"));
+ assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
+ .getBytes("UTF-8"));
+ }
+
+ @Test
public void testWmfDetection() throws Exception {
assertTypeByName("application/x-msmetafile", "x.wmf");
assertTypeByData("application/x-msmetafile", "testWMF.wmf");