You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 17:50:48 UTC
[tika] 01/03: TIKA-2648 detect interpreted server-side script languages
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit f5a2faefd17936e1ad2c9b6b8c9b0ea3d3c30d99
Author: Gérard Bouchar <gb...@protonmail.com>
AuthorDate: Fri Aug 3 13:10:53 2018 -0400
TIKA-2648 detect interpreted server-side script languages
Mime detection based on the resource name used to detect
the mime type of "http://example.com/test.php" as "text/x-php",
whereas for such a URL the file extension does not give
us any information about the mime type that will actually be
returned by the server.
---
.../main/java/org/apache/tika/mime/MimeType.java | 17 +++++++++++++++++
.../main/java/org/apache/tika/mime/MimeTypes.java | 16 +++++++++++-----
.../java/org/apache/tika/mime/MimeTypesReader.java | 3 +++
.../apache/tika/mime/MimeTypesReaderMetKeys.java | 2 ++
.../org/apache/tika/mime/tika-mimetypes.xml | 12 ++++++------
.../org/apache/tika/mime/CustomReaderTest.java | 2 ++
.../org/apache/tika/mime/MimeDetectionTest.java | 22 ++++++++++++++++++++++
.../org/apache/tika/mime/custom-mimetypes2.xml | 2 +-
8 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index b4d651e..d52c20b 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -111,6 +111,12 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
private List<String> extensions = null;
/**
+ * Whether this mime-type is used for server-side scripts,
+ * and thus cannot reliably be used for filename-based type detection
+ */
+ private boolean isInterpreted = false;
+
+ /**
* Creates a media type with the give name and containing media type
* registry. The name is expected to be valid and normalized to lower
* case. This constructor should only be called by
@@ -303,6 +309,17 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
}
/**
+ * whether the type is used as a server-side scripting technology
+ */
+ boolean isInterpreted() {
+ return isInterpreted;
+ }
+
+ void setInterpreted(boolean interpreted) {
+ isInterpreted = interpreted;
+ }
+
+ /**
* Defines a RootXML description. RootXML is made of a localName and/or a
* namespaceURI.
*/
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 705ad3d..38c2ecc 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -500,10 +500,13 @@ public final class MimeTypes implements Detector, Serializable {
String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
if (resourceName != null) {
String name = null;
+ boolean isHttp = false;
// Deal with a URI or a path name in as the resource name
try {
URI uri = new URI(resourceName);
+ String scheme = uri.getScheme();
+ isHttp = scheme != null && scheme.startsWith("http"); // http or https
String path = uri.getPath();
if (path != null) {
int slash = path.lastIndexOf('/');
@@ -517,11 +520,14 @@ public final class MimeTypes implements Detector, Serializable {
if (name != null) {
MimeType hint = getMimeType(name);
-
- // If we have some types based on mime magic, try to specialise
- // and/or select the type based on that
- // Otherwise, use the type identified from the name
- possibleTypes = applyHint(possibleTypes, hint);
+
+ // For server-side scripting languages, we cannot rely on the filename to detect the mime type
+ if (!(isHttp && hint.isInterpreted())) {
+ // If we have some types based on mime magic, try to specialise
+ // and/or select the type based on that
+ // Otherwise, use the type identified from the name
+ possibleTypes = applyHint(possibleTypes, hint);
+ }
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
index ad7bd80..cfc030f 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
@@ -169,8 +169,11 @@ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMe
if (type == null) {
if (MIME_TYPE_TAG.equals(qName)) {
String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
+ String interpretedAttr = attributes.getValue(INTERPRETED_ATTR);
+ boolean interpreted = "true".equals(interpretedAttr);
try {
type = types.forName(name);
+ type.setInterpreted(interpreted);
} catch (MimeTypeException e) {
handleMimeError(name, e, qName, attributes);
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
index 98bfee5..c77cc5c 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
@@ -27,6 +27,8 @@ public interface MimeTypesReaderMetKeys {
String MIME_TYPE_TYPE_ATTR = "type";
+ String INTERPRETED_ATTR = "interpreted";
+
String ACRONYM_TAG = "acronym";
String COMMENT_TAG = "_comment";
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3c4b4ca..61a1634 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5932,13 +5932,13 @@
<sub-class-of type="text/plain"/>
</mime-type>
- <mime-type type="text/asp">
+ <mime-type type="text/asp" interpreted="true">
<_comment>Active Server Page</_comment>
<glob pattern="*.asp"/>
<sub-class-of type="text/plain"/>
</mime-type>
- <mime-type type="text/aspdotnet">
+ <mime-type type="text/aspdotnet" interpreted="true">
<_comment>ASP .NET</_comment>
<glob pattern="*.aspx"/>
<sub-class-of type="text/plain"/>
@@ -6327,7 +6327,7 @@
<sub-class-of type="text/plain"/>
</mime-type>
- <mime-type type="text/x-cgi">
+ <mime-type type="text/x-cgi" interpreted="true">
<_comment>CGI script</_comment>
<glob pattern="*.cgi"/>
<sub-class-of type="text/plain"/>
@@ -6381,7 +6381,7 @@
<sub-class-of type="text/plain"/>
</mime-type>
- <mime-type type="text/x-coldfusion">
+ <mime-type type="text/x-coldfusion" interpreted="true">
<_comment>ColdFusion source code</_comment>
<glob pattern="*.cfm"/>
<glob pattern="*.cfml"/>
@@ -6497,7 +6497,7 @@
<sub-class-of type="text/plain"/>
</mime-type>
- <mime-type type="text/x-jsp">
+ <mime-type type="text/x-jsp" interpreted="true">
<_comment>Java Server Page</_comment>
<alias type="application/x-httpd-jsp"/>
<sub-class-of type="text/plain"/>
@@ -6620,7 +6620,7 @@
<sub-class-of type="text/plain"/>
</mime-type>
- <mime-type type="text/x-php">
+ <mime-type type="text/x-php" interpreted="true">
<_comment>PHP script</_comment>
<magic priority="50">
<match value="&lt;?php" type="string" offset="0"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
index 8928727..df51d45 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -89,6 +90,7 @@ public class CustomReaderTest {
assertEquals(1, reader.ignorePatterns.size());
assertEquals(another.toString()+">>*"+hello.getExtension(),
reader.ignorePatterns.get(0));
+ assertTrue("Server-side script type not detected", another.isInterpreted());
//System.out.println( mimeTypes.getMediaTypeRegistry().getTypes() );
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 6b16360..43eebd2 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -83,6 +83,21 @@ public class MimeDetectionTest {
}
@Test
+ public void testDetectionWithoutContent() throws IOException {
+ testUrlWithoutContent("text/html", "test.html");
+ testUrlWithoutContent("text/html", "http://test.com/test.html");
+ testUrlWithoutContent("text/plain", "http://test.com/test.txt");
+
+ // In case the url contains a filename referencing a server-side scripting language,
+ // it gives us no clue concerning the actual mime type of the response
+ testUrlWithoutContent("application/octet-stream", "http://test.com/test.php");
+ testUrlWithoutContent("application/octet-stream", "http://test.com/test.cgi");
+ testUrlWithoutContent("application/octet-stream", "http://test.com/test.jsp");
+ // But in case the protocol is not http or https, the script is probably not interpreted
+ testUrlWithoutContent("text/x-php", "ftp://test.com/test.php");
+ }
+
+ @Test
public void testByteOrderMark() throws Exception {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)),
@@ -136,6 +151,13 @@ public class MimeDetectionTest {
testStream(expected, url, in);
}
+ private void testUrlWithoutContent(String expected, String url) throws IOException {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, url);
+ String mime = this.mimeTypes.detect(null, metadata).toString();
+ assertEquals(url + " is not properly detected using only resource name", expected, mime);
+ }
+
private void testUrl(String expected, String url, String file) throws IOException{
InputStream in = getClass().getResourceAsStream(file);
testStream(expected, url, in);
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml b/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
index 2001d59..92d70cb 100644
--- a/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
+++ b/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
@@ -16,7 +16,7 @@
limitations under the License.
-->
<mime-info>
- <mime-type type="another/world-file">
+ <mime-type type="another/world-file" interpreted="true">
<hello>kittens</hello>
<glob pattern="*.hello.world" /> <!-- Will collide with 'hello/world-file' -->
<sub-class-of type="hello/world" />