You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/18 14:44:08 UTC

[tika] branch main updated: TIKA-3769 -- improve marc mime detection

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 750690297 TIKA-3769 -- improve marc mime detection
     new 4f8134da9 Merge remote-tracking branch 'origin/main' into main
750690297 is described below

commit 750690297ce38cb270d36ebee733c566a07cf4b9
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 18 10:43:47 2022 -0400

    TIKA-3769 -- improve marc mime detection
---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 22 ++++++++++++----------
 .../java/org/apache/tika/mime/TestMimeTypes.java   |  7 +++++++
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a2b734cc8..43d7820d3 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -440,16 +440,18 @@
     <magic priority="50">
       <!-- built from, e.g. https://www.loc.gov/marc/community/cileader.html -->
       <match value="[0-9]{5,5}" type="regex" offset="0">
-        <!-- bibliographic -->
-        <match value="[acdnp][acdefgijkmoprt][abcdims]" type="regex" offset="5"/>
-        <!-- authority-->
-        <match value="[acdnosx]z" type="regex" offset="5"/>
-        <!-- holdings -->
-        <match value="[cdn][uvxy]" type="regex" offset="5"/>
-        <!-- classification -->
-        <match value="[acdn]w" type="regex" offset="5"/>
-        <!-- community -->
-        <match value="[cdn]q" type="regex" offset="5"/>
+        <match value="45" type="string" offset="20">
+          <!-- bibliographic -->
+          <match value="[acdnp][acdefgijkmoprt][abcdims]" type="regex" offset="5"/>
+          <!-- authority-->
+          <match value="[acdnosx]z" type="regex" offset="5"/>
+          <!-- holdings -->
+          <match value="[cdn][uvxy]" type="regex" offset="5"/>
+          <!-- classification -->
+          <match value="[acdn]w" type="regex" offset="5"/>
+          <!-- community -->
+          <match value="[cdn]q" type="regex" offset="5"/>
+        </match>
       </match>
     </magic>
   </mime-type>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 9cccd08e8..a7aaada50 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -71,6 +71,13 @@ public class TestMimeTypes {
         assertNotNull(repo.forName("text/x-tex"));
     }
 
+    @Test
+    public void testTextNotMarc() throws Exception {
+        //TIKA-3769
+        String md5 = "89148cea02eff4bb856183b4506bb9d8";
+        assertTypeByData("text/plain", md5.getBytes(UTF_8));
+    }
+
     /**
      * Tests MIME type determination based solely on the URL's extension.
      */