You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/10/11 14:59:16 UTC

[tika] branch master updated: TIKA-2469 -- narrow mime detection for ms-owner files and add detection for nls files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new ad23d84  TIKA-2469 -- narrow mime detection for ms-owner files and add detection for nls files.
ad23d84 is described below

commit ad23d84f88abe6e540e989c988142f1ca0c876a7
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Oct 11 10:58:54 2017 -0400

    TIKA-2469 -- narrow mime detection for ms-owner files and add detection for nls files.
---
 CHANGES.txt                                                |   3 +++
 .../main/resources/org/apache/tika/mime/tika-mimetypes.xml |  13 ++++++++++++-
 .../apache/tika/parser/microsoft/MSOwnerFileParser.java    |  12 +++++++++++-
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java  |   5 +++++
 .../src/test/resources/test-documents/testNLS1.nls         | Bin 0 -> 59 bytes
 .../src/test/resources/test-documents/testNLS2.nls         | Bin 0 -> 12 bytes
 6 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index d9d0eb7..72ed77f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.17 - ???
 
+  * Narrow mime detection for ms-owner files and add detection
+    for .nls files (TIKA-2469).
+
   * Fix bug in CharsetDetector that led to different detected charsets
     depending on whether user setText with a byte[] or an InputStream
     via Sean Story (TIKA-2475).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index dd5d066..3d4284d 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3608,8 +3608,19 @@
   <mime-type type="application/x-ms-owner">
     <_comment>Temporary files created by MSOffice applications</_comment>
     <_comment>PRONOM fmt-473</_comment>
+    <_comment>First byte and the byte at offset 54 (i.e. after 53 intervening bytes) are the same -- the length of the name.</_comment>
+    <_comment>Based on TIKA-2469, we've added a heuristic/wild guess that the first 10 chars</_comment>
+    <_comment>after the length byte should be \x00 or a non-control character.</_comment>
     <magic priority="80">
-      <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+      <match value="(?s)^([\\x05-\\x0F])[\\x00\\x20-\\x7E]{10}.{43}\\1\x00" type="regex" offset="0"/>
+    </magic>
+  </mime-type>
+  <mime-type type="application/x-ms-nls">
+    <_comment>Microsoft National Language Support</_comment>
+    <_comment>Should take precedence over x-ms-owner</_comment>
+    <magic priority="70">
+      <match value="(?s)^\\x0D.{51}\\x0C\\x00\\x0D\\x00\\x0E" type="regex" offset="0"/>
+      <match value="(?s)^\\x44\\x43.\\x01" type="regex" offset="0"/>
     </magic>
   </mime-type>
   <mime-type type="application/x-ms-wmd">
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
index c7019f2..df0cc73 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.microsoft;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -65,16 +66,25 @@ public class MSOwnerFileParser extends AbstractParser {
         byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
         IOUtils.readFully(stream, asciiNameBytes);
         int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long
+        //sanity check name length
+        if (asciiNameLength < 0) {
+            throw new TikaException("ascii name length must be >= 0");
+        } else if (asciiNameLength > ASCII_CHUNK_LENGTH) {
+            throw new TikaException("ascii name length must be < 55");
+        }
+
         String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
         metadata.set(TikaCoreProperties.MODIFIER, asciiName);
 
         int unicodeCharLength = stream.read();
-        if (unicodeCharLength > 0) {
+        if (asciiNameLength == unicodeCharLength) {
             stream.read();//zero after the char length
             byte[] unicodeBytes = new byte[unicodeCharLength * 2];
             IOUtils.readFully(stream, unicodeBytes);
             String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
             metadata.set(TikaCoreProperties.MODIFIER, unicodeName);
+        } else {
+            throw new TikaException("Ascii name length should be the same as the unicode length");
         }
         xhtml.endDocument();
     }
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 058035c..e423fdd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1148,6 +1148,11 @@ public class TestMimeTypes {
         assertTypeByData("text/plain", "testSAS.sas");
     }
 
+    @Test
+    public void testNLS() throws Exception {
+        assertTypeByData("application/x-ms-nls", "testNLS1.nls");
+        assertTypeByData("application/x-ms-nls", "testNLS2.nls");
+    }
 
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
diff --git a/tika-parsers/src/test/resources/test-documents/testNLS1.nls b/tika-parsers/src/test/resources/test-documents/testNLS1.nls
new file mode 100644
index 0000000..02f4e4e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNLS1.nls differ
diff --git a/tika-parsers/src/test/resources/test-documents/testNLS2.nls b/tika-parsers/src/test/resources/test-documents/testNLS2.nls
new file mode 100644
index 0000000..dca3334
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNLS2.nls differ

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].