You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/15 17:33:13 UTC
[03/10] tika git commit: Add mime detection and parser for Microsoft
Owner File (TIKA-2008).
Add mime detection and parser for Microsoft Owner File (TIKA-2008).
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01a9b6db
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01a9b6db
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01a9b6db
Branch: refs/heads/TIKA-1508
Commit: 01a9b6db5ac20a63f2b9de9c15de1b12ee2bde06
Parents: d405172
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 09:20:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 09:20:29 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 +
.../org/apache/tika/mime/tika-mimetypes.xml | 7 ++
.../parser/microsoft/MSOwnerFileParser.java | 81 +++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 1 +
.../org/apache/tika/mime/TestMimeTypes.java | 5 ++
.../parser/microsoft/MSOwnerFileParserTest.java | 31 +++++++
.../resources/test-documents/testMSOwnerFile | Bin 0 -> 162 bytes
7 files changed, 127 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index d244bd4..3847d72 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.14 - ???
+ * Add mime definition and parser for MS Owner File (TIKA-2008).
+
* Add mime definition for Windows Media Metafile (TIKA-2004).
* Add mime definitions of iCal and vCalendar (TIKA-2006).
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 210ce0c..82df034 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3468,6 +3468,13 @@
<mime-type type="application/x-ms-application">
<glob pattern="*.application"/>
</mime-type>
+ <mime-type type="application/x-ms-owner">
+ <_comment>Temporary files created by MSOffice applications</_comment>
+ <_comment>PRONOM fmt-473</_comment>
+ <magic priority="80">
+ <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+ </magic>
+ </mime-type>
<mime-type type="application/x-ms-wmd">
<glob pattern="*.wmd"/>
</mime-type>
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
new file mode 100644
index 0000000..02c07a6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Parser for the temporary "owner" files ({@code application/x-ms-owner})
+ * created by MSOffice applications.
+ * This currently only extracts the owner's name, which is reported as
+ * {@link TikaCoreProperties#CREATOR}; no text content is written.
+ */
+public class MSOwnerFileParser extends AbstractParser {
+
+    /**
+     * Size in bytes of the leading chunk: one length byte followed by the
+     * space that holds the ASCII form of the owner's name.
+     */
+    private static final int ASCII_CHUNK_LENGTH = 54;
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the single supported type, {@code application/x-ms-owner}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts owner from MS temp file.
+     * <p>
+     * The ASCII name from the leading chunk is stored first; if a non-empty
+     * UTF-16LE name follows the chunk, it replaces the ASCII value.
+     *
+     * @param stream   the owner file; at least {@value #ASCII_CHUNK_LENGTH} bytes are read
+     * @param handler  receives an empty XHTML document
+     * @param metadata receives the owner name as {@link TikaCoreProperties#CREATOR}
+     * @param context  parse context (not consulted)
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects the XHTML events
+     * @throws TikaException if the name length byte does not fit in the ASCII chunk
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
+        IOUtils.readFully(stream, asciiNameBytes);
+
+        // Treat the length byte as unsigned and make sure the name fits in the
+        // bytes that follow it. Without this check a corrupt or misdetected file
+        // makes the String constructor throw an unchecked
+        // StringIndexOutOfBoundsException instead of a TikaException.
+        int asciiNameLength = asciiNameBytes[0] & 0xFF;
+        int maxAsciiNameLength = ASCII_CHUNK_LENGTH - 1;
+        if (asciiNameLength > maxAsciiNameLength) {
+            throw new TikaException("ascii name length must be <= "
+                    + maxAsciiNameLength + ", but was " + asciiNameLength);
+        }
+        String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
+        metadata.set(TikaCoreProperties.CREATOR, asciiName);
+
+        // read() yields -1 at end of stream, in which case only the ASCII name is kept
+        int unicodeCharLength = stream.read();
+        if (unicodeCharLength > 0) {
+            stream.read();//zero after the char length
+            // the length counts UTF-16 code units, i.e. two bytes each
+            byte[] unicodeBytes = new byte[unicodeCharLength * 2];
+            IOUtils.readFully(stream, unicodeBytes);
+            String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
+            metadata.set(TikaCoreProperties.CREATOR, unicodeName);
+        }
+        xhtml.endDocument();
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index acb0224..10a5a7e 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -36,6 +36,7 @@ org.apache.tika.parser.mail.RFC822Parser
org.apache.tika.parser.mbox.MboxParser
org.apache.tika.parser.mbox.OutlookPSTParser
org.apache.tika.parser.microsoft.JackcessParser
+org.apache.tika.parser.microsoft.MSOwnerFileParser
org.apache.tika.parser.microsoft.OfficeParser
org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 5dda858..bc83678 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1043,6 +1043,11 @@ public class TestMimeTypes {
assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
}
+    /** The MS owner test document must be detected as application/x-ms-owner. */
+    @Test
+    public void testMSOwner() throws Exception {
+        final String expectedType = "application/x-ms-owner";
+        final String testDocument = "testMSOwnerFile";
+        assertType(expectedType, testDocument);
+    }
+
private void assertText(byte[] prefix) throws IOException {
assertMagic("text/plain", prefix);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
new file mode 100644
index 0000000..3cef3df
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Checks the creator metadata reported for the MS owner test document.
+ */
+public class MSOwnerFileParserTest extends TikaTest {
+
+    /** Parsing the test document must report "heidi" as the creator. */
+    @Test
+    public void testBasic() throws Exception {
+        final XMLResult parsed = getXML("testMSOwnerFile");
+        final String creator = parsed.metadata.get(TikaCoreProperties.CREATOR);
+        assertEquals("heidi", creator);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/resources/test-documents/testMSOwnerFile
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testMSOwnerFile b/tika-parsers/src/test/resources/test-documents/testMSOwnerFile
new file mode 100644
index 0000000..72a5f57
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testMSOwnerFile differ