You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/21 15:58:00 UTC
[tika] branch main updated: TIKA-3489 -- add mime detection for robots.txt files
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5e2a3c0 TIKA-3489 -- add mime detection for robots.txt files
5e2a3c0 is described below
commit 5e2a3c081b3867086e417cb5cb032cb12be3c19d
Author: tballison <ta...@apache.org>
AuthorDate: Wed Jul 21 11:57:46 2021 -0400
TIKA-3489 -- add mime detection for robots.txt files
---
.../resources/org/apache/tika/mime/tika-mimetypes.xml | 16 ++++++++++++++++
.../test/java/org/apache/tika/mime/TestMimeTypes.java | 5 +++++
.../src/test/resources/test-documents/testRobots.txt | 9 +++++++++
3 files changed, 30 insertions(+)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 6c2ea14..3e8e432 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2042,6 +2042,22 @@
</magic>
<sub-class-of type="text/plain"/>
</mime-type>
+ <mime-type type="text/x-robots">
+ <!-- robots.txt file -->
+ <!-- draft: https://datatracker.ietf.org/doc/html/draft-koster-rep -->
+ <!-- should have a higher priority than rfc822 - TIKA-3489 -->
+ <magic priority="55">
+ <match minShouldMatch="2">
+ <match value="user-agent:" type="stringignorecase" offset="0"/>
+ <match value="sitemap:" type="stringignorecase" offset="0"/>
+ <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
+ <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
+ <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
+ <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
+ </match>
+ </magic>
+ <sub-class-of type="text/plain"/>
+ </mime-type>
<mime-type type="application/vnd.ms-tnef">
<alias type="application/ms-tnef" />
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 9a6a997..5b6bae0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -955,6 +955,11 @@ public class TestMimeTypes {
}
@Test
+ public void testRobots() throws Exception {
+ assertTypeByData("text/x-robots", "testRobots.txt");
+ }
+
+ @Test
public void testMessageNews() throws Exception {
assertTypeByData("message/news", "testMessageNews.txt");
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots.txt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots.txt
new file mode 100644
index 0000000..a3e113c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots.txt
@@ -0,0 +1,9 @@
+User-Agent: goodbot
+Disallow:
+
+User-Agent: badbot
+Disallow: /
+
+User-Agent: *
+Disallow: /forbidden/
+Allow: /
\ No newline at end of file