You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/02/10 16:56:54 UTC
svn commit: r908554 -
/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: jukka
Date: Wed Feb 10 15:56:53 2010
New Revision: 908554
URL: http://svn.apache.org/viewvc?rev=908554&view=rev
Log:
TIKA-377: Error parsing HTML partial with AutoDetect parser
Add <p> and <body> as XML root tags with high likelihood of actually being HTML tag soup.
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=908554&r1=908553&r2=908554&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed Feb 10 15:56:53 2010
@@ -3552,6 +3552,8 @@
-->
<root-XML localName="html"/>
<root-XML localName="link"/>
+ <root-XML localName="body"/>
+ <root-XML localName="p"/>
<magic priority="50">
<match value="&lt;!DOCTYPE HTML" type="string" offset="0:64"/>
<match value="&lt;!doctype html" type="string" offset="0:64"/>