You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 17:50:48 UTC

[tika] 01/03: TIKA-2648 detect interpreted server-side script languages

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f5a2faefd17936e1ad2c9b6b8c9b0ea3d3c30d99
Author: Gérard Bouchar <gb...@protonmail.com>
AuthorDate: Fri Aug 3 13:10:53 2018 -0400

    TIKA-2648 detect interpreted server-side script languages
    
    MIME detection based on the resource name used to detect
    the MIME type of "http://example.com/test.php" as "text/x-php".
    However, given such a URL, the file extension doesn't give
    us any information about the MIME type that will be returned
    by the server.
---
 .../main/java/org/apache/tika/mime/MimeType.java   | 17 +++++++++++++++++
 .../main/java/org/apache/tika/mime/MimeTypes.java  | 16 +++++++++++-----
 .../java/org/apache/tika/mime/MimeTypesReader.java |  3 +++
 .../apache/tika/mime/MimeTypesReaderMetKeys.java   |  2 ++
 .../org/apache/tika/mime/tika-mimetypes.xml        | 12 ++++++------
 .../org/apache/tika/mime/CustomReaderTest.java     |  2 ++
 .../org/apache/tika/mime/MimeDetectionTest.java    | 22 ++++++++++++++++++++++
 .../org/apache/tika/mime/custom-mimetypes2.xml     |  2 +-
 8 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index b4d651e..d52c20b 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -111,6 +111,12 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
     private List<String> extensions = null;
 
     /**
+     * Whether this mime-type is used for server-side scripts,
+     * and thus cannot reliably be used for filename-based type detection
+     */
+    private boolean isInterpreted = false;
+
+    /**
      * Creates a media type with the give name and containing media type
      * registry. The name is expected to be valid and normalized to lower
      * case. This constructor should only be called by
@@ -303,6 +309,17 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
     }
 
     /**
+     * whether the type is used as a server-side scripting technology
+     */
+    boolean isInterpreted() {
+        return isInterpreted;
+    }
+
+    void setInterpreted(boolean interpreted) {
+        isInterpreted = interpreted;
+    }
+
+    /**
      * Defines a RootXML description. RootXML is made of a localName and/or a
      * namespaceURI.
      */
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 705ad3d..38c2ecc 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -500,10 +500,13 @@ public final class MimeTypes implements Detector, Serializable {
         String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
         if (resourceName != null) {
             String name = null;
+            boolean isHttp = false;
 
             // Deal with a URI or a path name in as the resource  name
             try {
                 URI uri = new URI(resourceName);
+                String scheme = uri.getScheme();
+                isHttp = scheme != null && scheme.startsWith("http"); // http or https
                 String path = uri.getPath();
                 if (path != null) {
                     int slash = path.lastIndexOf('/');
@@ -517,11 +520,14 @@ public final class MimeTypes implements Detector, Serializable {
 
             if (name != null) {
                 MimeType hint = getMimeType(name);
-                
-                // If we have some types based on mime magic, try to specialise
-                //  and/or select the type based on that
-                // Otherwise, use the type identified from the name
-                possibleTypes = applyHint(possibleTypes, hint);
+
+                // For server-side scripting languages, we cannot rely on the filename to detect the mime type
+                if (!(isHttp && hint.isInterpreted())) {
+                    // If we have some types based on mime magic, try to specialise
+                    //  and/or select the type based on that
+                    // Otherwise, use the type identified from the name
+                    possibleTypes = applyHint(possibleTypes, hint);
+                }
             }
         }
 
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
index ad7bd80..cfc030f 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
@@ -169,8 +169,11 @@ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMe
         if (type == null) {
             if (MIME_TYPE_TAG.equals(qName)) {
                 String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
+                String interpretedAttr = attributes.getValue(INTERPRETED_ATTR);
+                boolean interpreted = "true".equals(interpretedAttr);
                 try {
                     type = types.forName(name);
+                    type.setInterpreted(interpreted);
                 } catch (MimeTypeException e) {
                     handleMimeError(name, e, qName, attributes);
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
index 98bfee5..c77cc5c 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
@@ -27,6 +27,8 @@ public interface MimeTypesReaderMetKeys {
 
     String MIME_TYPE_TYPE_ATTR = "type";
 
+    String INTERPRETED_ATTR = "interpreted";
+
     String ACRONYM_TAG = "acronym";
 
     String COMMENT_TAG = "_comment";
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3c4b4ca..61a1634 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5932,13 +5932,13 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/asp">
+  <mime-type type="text/asp" interpreted="true">
     <_comment>Active Server Page</_comment>
     <glob pattern="*.asp"/>
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/aspdotnet">
+  <mime-type type="text/aspdotnet" interpreted="true">
     <_comment>ASP .NET</_comment>
     <glob pattern="*.aspx"/>
     <sub-class-of type="text/plain"/>
@@ -6327,7 +6327,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-cgi">
+  <mime-type type="text/x-cgi" interpreted="true">
     <_comment>CGI script</_comment>
     <glob pattern="*.cgi"/>
     <sub-class-of type="text/plain"/>
@@ -6381,7 +6381,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-coldfusion">
+  <mime-type type="text/x-coldfusion" interpreted="true">
     <_comment>ColdFusion source code</_comment>
     <glob pattern="*.cfm"/>
     <glob pattern="*.cfml"/>
@@ -6497,7 +6497,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-jsp">
+  <mime-type type="text/x-jsp" interpreted="true">
     <_comment>Java Server Page</_comment>
     <alias type="application/x-httpd-jsp"/>
     <sub-class-of type="text/plain"/>
@@ -6620,7 +6620,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-php">
+  <mime-type type="text/x-php" interpreted="true">
     <_comment>PHP script</_comment>
     <magic priority="50">
       <match value="&lt;?php" type="string" offset="0"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
index 8928727..df51d45 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
@@ -23,6 +23,7 @@ import java.util.Map;
 
 import org.junit.Test;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -89,6 +90,7 @@ public class CustomReaderTest {
     assertEquals(1, reader.ignorePatterns.size());
     assertEquals(another.toString()+">>*"+hello.getExtension(), 
         reader.ignorePatterns.get(0));
+    assertTrue("Server-side script type not detected", another.isInterpreted());
     
     //System.out.println( mimeTypes.getMediaTypeRegistry().getTypes() );
   }
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 6b16360..43eebd2 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -83,6 +83,21 @@ public class MimeDetectionTest {
     }
 
     @Test
+    public void testDetectionWithoutContent() throws IOException {
+        testUrlWithoutContent("text/html", "test.html");
+        testUrlWithoutContent("text/html", "http://test.com/test.html");
+        testUrlWithoutContent("text/plain", "http://test.com/test.txt");
+
+        // In case the url contains a filename referencing a server-side scripting language,
+        // it gives us no clue concerning the actual mime type of the response
+        testUrlWithoutContent("application/octet-stream", "http://test.com/test.php");
+        testUrlWithoutContent("application/octet-stream", "http://test.com/test.cgi");
+        testUrlWithoutContent("application/octet-stream", "http://test.com/test.jsp");
+        // But in case the protocol is not http or https, the script is probably not interpreted
+        testUrlWithoutContent("text/x-php", "ftp://test.com/test.php");
+    }
+
+    @Test
     public void testByteOrderMark() throws Exception {
         assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
                 new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)),
@@ -136,6 +151,13 @@ public class MimeDetectionTest {
         testStream(expected, url, in);
     }
 
+    private void testUrlWithoutContent(String expected, String url) throws IOException {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, url);
+        String mime = this.mimeTypes.detect(null, metadata).toString();
+        assertEquals(url + " is not properly detected using only resource name", expected, mime);
+    }
+
     private void testUrl(String expected, String url, String file) throws IOException{
         InputStream in = getClass().getResourceAsStream(file);
         testStream(expected, url, in);
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml b/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
index 2001d59..92d70cb 100644
--- a/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
+++ b/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
@@ -16,7 +16,7 @@
   limitations under the License.
 -->
 <mime-info>
-  <mime-type type="another/world-file">
+  <mime-type type="another/world-file" interpreted="true">
      <hello>kittens</hello>
      <glob pattern="*.hello.world" /> <!-- Will collide with 'hello/world-file'  -->
      <sub-class-of type="hello/world" />