You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/10/11 14:59:16 UTC
[tika] branch master updated: TIKA-2469 -- narrow mime detection for ms-owner files and add detection for nls files.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new ad23d84 TIKA-2469 -- narrow mime detection for ms-owner files and add detection for nls files.
ad23d84 is described below
commit ad23d84f88abe6e540e989c988142f1ca0c876a7
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Oct 11 10:58:54 2017 -0400
TIKA-2469 -- narrow mime detection for ms-owner files and add detection for nls files.
---
CHANGES.txt | 3 +++
.../main/resources/org/apache/tika/mime/tika-mimetypes.xml | 13 ++++++++++++-
.../apache/tika/parser/microsoft/MSOwnerFileParser.java | 12 +++++++++++-
.../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 5 +++++
.../src/test/resources/test-documents/testNLS1.nls | Bin 0 -> 59 bytes
.../src/test/resources/test-documents/testNLS2.nls | Bin 0 -> 12 bytes
6 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index d9d0eb7..72ed77f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.17 - ???
+ * Narrow mime detection for ms-owner files and add detection
+ for .nls files (TIKA-2469).
+
* Fix bug in CharsetDetector that led to different detected charsets
depending on whether user setText with a byte[] or an InputStream
via Sean Story (TIKA-2475).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index dd5d066..3d4284d 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3608,8 +3608,19 @@
<mime-type type="application/x-ms-owner">
<_comment>Temporary files created by MSOffice applications</_comment>
<_comment>PRONOM fmt-473</_comment>
+ <_comment>First byte and 53rd byte are the same -- the length of the name.</_comment>
+ <_comment>Based on TIKA-2469, we've added a heuristic/wild guess that the first 10 chars</_comment>
+ <_comment>after the length byte should be \x00 or a non-control character.</_comment>
<magic priority="80">
- <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+ <match value="(?s)^([\\x05-\\x0F])[\\x00\\x20-\\x7E]{10}.{43}\\1\x00" type="regex" offset="0"/>
+ </magic>
+ </mime-type>
+ <mime-type type="application/x-ms-nls">
+ <_comment>Microsoft National Language Support</_comment>
+ <_comment>Should take precedence over x-ms-owner</_comment>
+ <magic priority="70">
+ <match value="(?s)^\\x0D.{51}\\x0C\\x00\\x0D\\x00\\x0E" type="regex" offset="0"/>
+ <match value="(?s)^\\x44\\x43.\\x01" type="regex" offset="0"/>
</magic>
</mime-type>
<mime-type type="application/x-ms-wmd">
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
index c7019f2..df0cc73 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft;
import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -65,16 +66,25 @@ public class MSOwnerFileParser extends AbstractParser {
byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
IOUtils.readFully(stream, asciiNameBytes);
int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long
+ //sanity check name length
+ if (asciiNameLength < 0) {
+ throw new TikaException("ascii name length must be >= 0");
+ } else if (asciiNameLength > ASCII_CHUNK_LENGTH) {
+ throw new TikaException("ascii name length must be < 55");
+ }
+
String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
metadata.set(TikaCoreProperties.MODIFIER, asciiName);
int unicodeCharLength = stream.read();
- if (unicodeCharLength > 0) {
+ if (asciiNameLength == unicodeCharLength) {
stream.read();//zero after the char length
byte[] unicodeBytes = new byte[unicodeCharLength * 2];
IOUtils.readFully(stream, unicodeBytes);
String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
metadata.set(TikaCoreProperties.MODIFIER, unicodeName);
+ } else {
+ throw new TikaException("Ascii name length should be the same as the unicode length");
}
xhtml.endDocument();
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 058035c..e423fdd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1148,6 +1148,11 @@ public class TestMimeTypes {
assertTypeByData("text/plain", "testSAS.sas");
}
+ @Test
+ public void testNLS() throws Exception {
+ assertTypeByData("application/x-ms-nls", "testNLS1.nls");
+ assertTypeByData("application/x-ms-nls", "testNLS2.nls");
+ }
private void assertText(byte[] prefix) throws IOException {
assertMagic("text/plain", prefix);
diff --git a/tika-parsers/src/test/resources/test-documents/testNLS1.nls b/tika-parsers/src/test/resources/test-documents/testNLS1.nls
new file mode 100644
index 0000000..02f4e4e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNLS1.nls differ
diff --git a/tika-parsers/src/test/resources/test-documents/testNLS2.nls b/tika-parsers/src/test/resources/test-documents/testNLS2.nls
new file mode 100644
index 0000000..dca3334
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNLS2.nls differ
--
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.