You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2013/05/28 16:25:46 UTC
svn commit: r1486936 - in /tika/trunk/tika-core/src:
main/resources/org/apache/tika/mime/tika-mimetypes.xml
test/java/org/apache/tika/mime/MimeDetectionTest.java
test/resources/org/apache/tika/mime/htmlfragment
Author: kkrugler
Date: Tue May 28 14:25:46 2013
New Revision: 1486936
URL: http://svn.apache.org/r1486936
Log:
TIKA-1102: detect fragment that starts with <div> or <DIV> as HTML.
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue May 28 14:25:46 2013
@@ -4533,6 +4533,8 @@
<match value="&lt;HTML" type="string" offset="0:64"/>
<match value="&lt;BODY" type="string" offset="0"/>
<match value="&lt;body" type="string" offset="0"/>
+ <match value="&lt;DIV" type="string" offset="0"/>
+ <match value="&lt;div" type="string" offset="0"/>
<match value="&lt;TITLE" type="string" offset="0"/>
<match value="&lt;title" type="string" offset="0"/>
<match value="&lt;h1" type="string" offset="0"/>
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Tue May 28 14:25:46 2013
@@ -63,6 +63,8 @@ public class MimeDetectionTest extends T
testFile("text/html", "evilhtml.html");
// add another evil html test from TIKA-357
testFile("text/html", "testlargerbuffer.html");
+ // test fragment of HTML with <div> (TIKA-1102)
+ testFile("text/html", "htmlfragment");
}
public void testByteOrderMark() throws Exception {
Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment?rev=1486936&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment (added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment Tue May 28 14:25:46 2013
@@ -0,0 +1,18 @@
+<div id="leftcol">
+ <ul>
+ <li><a href="/mission/sec/sec.html"> Security and Information Sciences Home ›</a> </li>
+ <li><a href="/mission/sec/publications/-publications.html">Publications ›</a> </li>
+ <li><a href="/mission/sec/corpora/corpora.html">Corpora ›</a> </li>
+ <li><a href="/mission/sec/softwaretools/tools.html">Software Tools ›</a></li>
+ <li><a href="/mission/sec/CSO/CSO.html"> Systems and Operations ›</a>
+ <ul>
+ <li><a href="/mission/sec/publications/-publications.html">Publications ›</a></li>
+ <li><a href="/mission/sec/CSO/biographies/CSObios.html">Biographies ›</a></li>
+ </ul>
+ </li>
+ <li><a href="/mission/sec/CST/CST.html"> Systems and Technology ›</a> </li>
+ <li><a href="/mission/sec/CSA/CSA.html"> System Assessments ›</a> </li>
+ <li><a href="/mission/sec/HLT/HLT.html">Human Language Technology ›</a>
+<li><a href="/mission/sec/computing/computing.html">Computing and Analytics ›</a></li>
+ </ul>
+</div>