You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/18 14:44:08 UTC
[tika] branch main updated: TIKA-3769 -- improve marc mime detection
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 750690297 TIKA-3769 -- improve marc mime detection
new 4f8134da9 Merge remote-tracking branch 'origin/main' into main
750690297 is described below
commit 750690297ce38cb270d36ebee733c566a07cf4b9
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 18 10:43:47 2022 -0400
TIKA-3769 -- improve marc mime detection
---
.../org/apache/tika/mime/tika-mimetypes.xml | 22 ++++++++++++----------
.../java/org/apache/tika/mime/TestMimeTypes.java | 7 +++++++
2 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a2b734cc8..43d7820d3 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -440,16 +440,18 @@
<magic priority="50">
<!-- built from, e.g. https://www.loc.gov/marc/community/cileader.html -->
<match value="[0-9]{5,5}" type="regex" offset="0">
- <!-- bibliographic -->
- <match value="[acdnp][acdefgijkmoprt][abcdims]" type="regex" offset="5"/>
- <!-- authority-->
- <match value="[acdnosx]z" type="regex" offset="5"/>
- <!-- holdings -->
- <match value="[cdn][uvxy]" type="regex" offset="5"/>
- <!-- classification -->
- <match value="[acdn]w" type="regex" offset="5"/>
- <!-- community -->
- <match value="[cdn]q" type="regex" offset="5"/>
+ <match value="45" type="string" offset="20">
+ <!-- bibliographic -->
+ <match value="[acdnp][acdefgijkmoprt][abcdims]" type="regex" offset="5"/>
+ <!-- authority-->
+ <match value="[acdnosx]z" type="regex" offset="5"/>
+ <!-- holdings -->
+ <match value="[cdn][uvxy]" type="regex" offset="5"/>
+ <!-- classification -->
+ <match value="[acdn]w" type="regex" offset="5"/>
+ <!-- community -->
+ <match value="[cdn]q" type="regex" offset="5"/>
+ </match>
</match>
</magic>
</mime-type>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 9cccd08e8..a7aaada50 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -71,6 +71,13 @@ public class TestMimeTypes {
assertNotNull(repo.forName("text/x-tex"));
}
+ @Test
+ public void testTextNotMarc() throws Exception {
+ //TIKA-3769
+ String md5 = "89148cea02eff4bb856183b4506bb9d8";
+ assertTypeByData("text/plain", md5.getBytes(UTF_8));
+ }
+
/**
* Tests MIME type determination based solely on the URL's extension.
*/