You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2013/05/28 16:25:46 UTC

svn commit: r1486936 - in /tika/trunk/tika-core/src: main/resources/org/apache/tika/mime/tika-mimetypes.xml test/java/org/apache/tika/mime/MimeDetectionTest.java test/resources/org/apache/tika/mime/htmlfragment

Author: kkrugler
Date: Tue May 28 14:25:46 2013
New Revision: 1486936

URL: http://svn.apache.org/r1486936
Log:
TIKA-1102: detect fragment that starts with <div> or <DIV> as HTML.

Added:
    tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue May 28 14:25:46 2013
@@ -4533,6 +4533,8 @@
       <match value="&lt;HTML" type="string" offset="0:64"/>
       <match value="&lt;BODY" type="string" offset="0"/>
       <match value="&lt;body" type="string" offset="0"/>
+      <match value="&lt;DIV" type="string" offset="0"/>
+      <match value="&lt;div" type="string" offset="0"/>
       <match value="&lt;TITLE" type="string" offset="0"/>
       <match value="&lt;title" type="string" offset="0"/>
       <match value="&lt;h1" type="string" offset="0"/>

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Tue May 28 14:25:46 2013
@@ -63,6 +63,8 @@ public class MimeDetectionTest extends T
         testFile("text/html", "evilhtml.html");
         // add another evil html test from TIKA-357
         testFile("text/html", "testlargerbuffer.html");
+        // test fragment of HTML with <div> (TIKA-1102)
+        testFile("text/html", "htmlfragment");
     }
 
     public void testByteOrderMark() throws Exception {

Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment?rev=1486936&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment (added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment Tue May 28 14:25:46 2013
@@ -0,0 +1,18 @@
+<div id="leftcol">
+	  <ul>
+        <li><a href="/mission/sec/sec.html"> Security and Information Sciences Home&nbsp;&rsaquo;</a>        </li>
+        <li><a href="/mission/sec/publications/-publications.html">Publications&nbsp;&rsaquo;</a> </li>
+        <li><a href="/mission/sec/corpora/corpora.html">Corpora&nbsp;&rsaquo;</a> </li>
+        <li><a href="/mission/sec/softwaretools/tools.html">Software Tools&nbsp;&rsaquo;</a></li>
+        <li><a href="/mission/sec/CSO/CSO.html"> Systems and Operations&nbsp;&rsaquo;</a>
+          <ul>
+            <li><a href="/mission/sec/publications/-publications.html">Publications &rsaquo;</a></li>
+            <li><a href="/mission/sec/CSO/biographies/CSObios.html">Biographies&nbsp;&rsaquo;</a></li>
+          </ul>
+        </li>
+        <li><a href="/mission/sec/CST/CST.html"> Systems and Technology&nbsp;&rsaquo;</a> </li>
+        <li><a href="/mission/sec/CSA/CSA.html"> System Assessments&nbsp;&rsaquo;</a> </li>
+	    <li><a href="/mission/sec/HLT/HLT.html">Human Language Technology&nbsp;&rsaquo;</a>
+<li><a href="/mission/sec/computing/computing.html">Computing and Analytics&nbsp;&rsaquo;</a></li>
+  </ul>
+</div>