You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/08/11 16:25:20 UTC

[tika] branch branch_1x updated: TIKA-3489 -- detect robots.txt files as text/x-robots

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 27e7eac  TIKA-3489 -- detect robots.txt files as text/x-robots
     new 25552a9  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
27e7eac is described below

commit 27e7eac5fc7c2122076237c191a2bd0aa2748aa4
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 11 12:25:00 2021 -0400

    TIKA-3489 -- detect robots.txt files as text/x-robots
---
 .../resources/org/apache/tika/mime/tika-mimetypes.xml   | 17 ++++++++++++++++-
 .../test/java/org/apache/tika/mime/TestMimeTypes.java   |  6 ++++++
 .../src/test/resources/test-documents/testRobots.txt    |  9 +++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 87e50b7..68caa88 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -1943,7 +1943,22 @@
     </magic>
     <sub-class-of type="text/plain"/>
   </mime-type>
-
+  <mime-type type="text/x-robots">
+    <!-- robots.txt file -->
+    <!-- draft: https://datatracker.ietf.org/doc/html/draft-koster-rep -->
+    <!-- should have a higher priority than rfc822 - TIKA-3489 -->
+    <magic priority="55">
+      <match minShouldMatch="2">
+        <match value="user-agent:" type="stringignorecase" offset="0"/>
+        <match value="sitemap:" type="stringignorecase" offset="0"/>
+        <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
+        <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
+        <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
+        <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
+      </match>
+    </magic>
+    <sub-class-of type="text/plain"/>
+  </mime-type>
   <mime-type type="application/vnd.ms-tnef">
     <alias type="application/ms-tnef" />
     <magic priority="50">
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index c765dae..f1c7939 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -22,6 +22,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import java.io.ByteArrayInputStream;
@@ -966,6 +967,11 @@ public class TestMimeTypes {
     }
 
     @Test
+    public void testRobots() throws Exception {
+        assertTypeByData("text/x-robots", "testRobots.txt");
+    }
+
+    @Test
     public void testMessageNews() throws Exception {
         assertTypeByData("message/news", "testMessageNews.txt");
     }
diff --git a/tika-parsers/src/test/resources/test-documents/testRobots.txt b/tika-parsers/src/test/resources/test-documents/testRobots.txt
new file mode 100644
index 0000000..a3e113c
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRobots.txt
@@ -0,0 +1,9 @@
+User-Agent: goodbot
+Disallow:
+
+User-Agent: badbot
+Disallow: /
+
+User-Agent: *
+Disallow: /forbidden/
+Allow: /
\ No newline at end of file