You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/24 15:57:03 UTC

(tika) branch main updated: [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" to follow RFC-9239 (#1556)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new ae737cd26 [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" to follow RFC-9239 (#1556)
ae737cd26 is described below

commit ae737cd2625b5e2659c20c27713785df8bfc1957
Author: Marcos Pereira <ma...@gmail.com>
AuthorDate: Wed Jan 24 10:56:58 2024 -0500

    [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript" to follow RFC-9239 (#1556)
    
    * [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript"
    
    Following RFC 9239. This also adds support for ".mjs" (as documented in the RFC).
---
 CHANGES.txt                                        |  3 ++
 README.md                                          |  6 +--
 .../org/apache/tika/mime/tika-mimetypes.xml        |  9 +++--
 .../java/org/apache/tika/TikaDetectionTest.java    |  3 +-
 .../org/apache/tika/mime/MimeTypesReaderTest.java  | 43 ++++++++++++++--------
 .../tika/mime/ProbabilisticMimeDetectionTest.java  |  2 +-
 .../java/org/apache/tika/mime/TestMimeTypes.java   | 12 +++---
 7 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9953e46d3..f9ac540e6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -16,6 +16,9 @@ Release 3.0.0-BETA - 12/01/2023
    * Tika will look for "custom-mimetypes.xml" directly on the classpath, NOT
      under "/org/apache/tika/mime/". (TIKA-4147).
 
+   * Return media type "text/javascript" instead of "application/javascript"
+     to follow RFC-9239. (TIKA-4119).
+
    Other Changes/Updates
 
    * Upgrade PDFBox to 3.0.1 (TIKA-3347)
diff --git a/README.md b/README.md
index 97d709656..88fe59cb8 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Tika jars can be fetched from Maven Central or your favourite Maven mirror.
 
 **Tika 1.X reached End of Life (EOL) on September 30, 2022.**  
 
-Tika is based on **Java 8** and uses the [Maven 3](https://maven.apache.org) build system. 
+Tika is based on **Java 11** and uses the [Maven 3](https://maven.apache.org) build system. 
 **N.B.** [Docker](https://www.docker.com/products/personal) is used for tests in tika-integration-tests.
 As of Tika 2.5.1, if Docker is not installed, those tests are skipped.  Docker is required for a successful
 build on earlier 2.x versions.
@@ -50,7 +50,7 @@ Maven Dependencies
 
 Apache Tika provides *Bill of Material* (BOM) artifact to align Tika module versions and simplify version management. 
 To avoid convergence errors in your own project, import this
-bom or Tika's parent pom.xml in your dependencey management section.
+bom or Tika's parent pom.xml in your dependency management section.
 
 If you use Apache Maven:
 
@@ -170,7 +170,7 @@ Notification on all code changes are sent to the following mailing list:
 The mailing lists are open to anyone and publicly archived.
 
 You can subscribe the mailing lists by sending a message to 
-[LIST]-subscribe@tika.apache.org (for example user-subscribe@...).  
+[LIST]-subscribe@tika.apache.org (for example, user-subscribe@...).  
 To unsubscribe, send a message to [LIST]-unsubscribe@tika.apache.org.  
 For more instructions, send a message to [LIST]-help@tika.apache.org.
 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index b76adebd1..54f7cc6f6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -366,15 +366,18 @@
     <glob pattern="*.ser"/>
   </mime-type>
 
-  <mime-type type="application/javascript">
+  <mime-type type="text/javascript">
+    <alias type="application/javascript"/>
     <alias type="application/x-javascript"/>
-    <alias type="text/javascript"/>
     <sub-class-of type="text/plain"/>
     <_comment>JavaScript Source Code</_comment>
+    <!-- From RFC 9239: https://www.rfc-editor.org/rfc/rfc9239.html#name-text-javascript -->
+    <!-- File extension(s): .js, .mjs -->
     <glob pattern="*.js"/>
+    <glob pattern="*.mjs"/>
 
     <!-- Note - there is no Unique Magic for JavaScript files! -->
-    <!-- Generally you can only detect JS with the filename -->
+    <!-- Generally, you can only detect JS with the filename -->
     <!-- However... A few common JS libraries accidentally trigger -->
     <!--  the HTML priority=20 magic incorrectly. So, for those only, -->
     <!--  we list "magic" for those specific files -->
diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
index 1cd0f40a2..215865886 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
@@ -56,7 +56,8 @@ public class TikaDetectionTest {
         assertEquals("application/java-archive", tika.detect("x.jar"));
         assertEquals("application/java-serialized-object", tika.detect("x.ser"));
         assertEquals("application/java-vm", tika.detect("x.class"));
-        assertEquals("application/javascript", tika.detect("x.js"));
+        assertEquals("text/javascript", tika.detect("x.js"));
+        assertEquals("text/javascript", tika.detect("x.mjs"));
         assertEquals("application/json", tika.detect("x.json"));
         assertEquals("application/lost+xml", tika.detect("x.lostxml"));
         assertEquals("application/mac-binhex40", tika.detect("x.hqx"));
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
index 7d576d73a..0d904f6df 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
@@ -31,6 +31,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.Executors;
+import java.util.stream.Collectors;
 
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -279,6 +280,24 @@ public class MimeTypesReaderTest {
         assertEquals(".ppt", mt.getExtensions().get(0));
     }
 
+    @Test
+    public void testGetExtensionForJavaScript() throws Exception {
+        MimeType mt = this.mimeTypes.forName("text/javascript");
+        assertEquals(".js", mt.getExtension());
+        assertEquals(List.of(".js", ".mjs"), mt.getExtensions());
+    }
+
+    @Test
+    public void testGetAliasForJavaScript() throws Exception {
+        MimeType mt = this.mimeTypes.forName("text/javascript");
+        Set<String> aliases = mimeTypes.getMediaTypeRegistry()
+                .getAliases(mt.getType())
+                .stream()
+                .map(MediaType::toString)
+                .collect(Collectors.toSet());
+        assertEquals(Set.of("application/javascript", "application/x-javascript"), aliases);
+    }
+
     @Test
     public void testGetRegisteredMimesWithParameters() throws Exception {
         //TIKA-1692
@@ -351,40 +370,32 @@ public class MimeTypesReaderTest {
     }
 
     @Test
-    public void testBadMinShouldMatch1() throws Exception {
+    public void testBadMinShouldMatch1() {
         System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
                 "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch1.xml");
 
-        assertThrows(IllegalArgumentException.class, () -> {
-            MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
-        });
+        assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
     }
 
     @Test
-    public void testBadMinShouldMatch2() throws Exception {
+    public void testBadMinShouldMatch2() {
         System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
                 "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch2.xml");
-        assertThrows(IllegalArgumentException.class, () -> {
-            MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
-        });
+        assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
     }
 
     @Test
-    public void testBadMinShouldMatch3() throws Exception {
+    public void testBadMinShouldMatch3() {
         System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
                 "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch3.xml");
-        assertThrows(IllegalArgumentException.class, () -> {
-            MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
-        });
+        assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
     }
 
     @Test
-    public void testBadMinShouldMatch4() throws Exception {
+    public void testBadMinShouldMatch4() {
         System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
                 "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch4.xml");
-        assertThrows(IllegalArgumentException.class, () -> {
-            MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
-        });
+        assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
     }
 
     private static class CustomClassLoader extends ClassLoader {
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
index 437cefc6d..d9a65bf1b 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
@@ -246,6 +246,6 @@ public class ProbabilisticMimeDetectionTest {
                         "} catch (e) {\n" + "    console.log(e);\n" + "}")
                         .getBytes(StandardCharsets.UTF_8));
         MediaType detect = new ProbabilisticMimeDetectionSelector().detect(input, metadata);
-        assertEquals(MediaType.application("javascript"), detect);
+        assertEquals(MediaType.text("javascript"), detect);
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 3dad7d6af..1b66a7efe 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -718,8 +718,8 @@ public class TestMimeTypes {
         assertTypeByName("text/html", "testHTML.html");
         assertType("text/html", "testHTML.html");
 
-        assertTypeByName("application/javascript", "testJS.js");
-        assertType("application/javascript", "testJS.js");
+        assertTypeByName("text/javascript", "testJS.js");
+        assertType("text/javascript", "testJS.js");
 
         assertType("text/vnd.graphviz", "testGRAPHVIZd.dot");
         assertType("text/vnd.graphviz", "testGRAPHVIZg.dot");
@@ -1148,10 +1148,10 @@ public class TestMimeTypes {
         assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
 
         // By name, or by name+data, gets it as JS
-        assertTypeByName("application/javascript", "testJS.js");
-        assertTypeByName("application/javascript", "testJS_HTML.js");
-        assertType("application/javascript", "testJS.js");
-        assertType("application/javascript", "testJS_HTML.js");
+        assertTypeByName("text/javascript", "testJS.js");
+        assertTypeByName("text/javascript", "testJS_HTML.js");
+        assertType("text/javascript", "testJS.js");
+        assertType("text/javascript", "testJS_HTML.js");
 
         // With data only, because we have no JS file magic, can't be
         //  detected. One will come through as plain text, the other