You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/15 17:33:13 UTC

[03/10] tika git commit: Add mime detection and parser for Microsoft Owner File (TIKA-2008).

Add mime detection and parser for Microsoft Owner File (TIKA-2008).


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01a9b6db
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01a9b6db
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01a9b6db

Branch: refs/heads/TIKA-1508
Commit: 01a9b6db5ac20a63f2b9de9c15de1b12ee2bde06
Parents: d405172
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 09:20:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 09:20:29 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |   7 ++
 .../parser/microsoft/MSOwnerFileParser.java     |  81 +++++++++++++++++++
 .../services/org.apache.tika.parser.Parser      |   1 +
 .../org/apache/tika/mime/TestMimeTypes.java     |   5 ++
 .../parser/microsoft/MSOwnerFileParserTest.java |  31 +++++++
 .../resources/test-documents/testMSOwnerFile    | Bin 0 -> 162 bytes
 7 files changed, 127 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index d244bd4..3847d72 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Add mime definition and parser for MS Owner File (TIKA-2008).
+
   * Add mime definition for Windows Media Metafile (TIKA-2004).
 
   * Add mime definitions of iCal and vCalendar (TIKA-2006).

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 210ce0c..82df034 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3468,6 +3468,13 @@
   <mime-type type="application/x-ms-application">
     <glob pattern="*.application"/>
   </mime-type>
+  <mime-type type="application/x-ms-owner">
+    <_comment>Temporary files created by MSOffice applications</_comment>
+    <_comment>PRONOM fmt-473</_comment>
+    <magic priority="80">
+      <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-ms-wmd">
     <glob pattern="*.wmd"/>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
new file mode 100644
index 0000000..02c07a6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Parser for temporary MSOffice files.
+ * This currently only extracts the owner's name.
+ */
+public class MSOwnerFileParser extends AbstractParser {
+    private static final int ASCII_CHUNK_LENGTH = 54;//one length byte, then the ascii name field
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner");
+    /** Serial version UID */
+    private static final long serialVersionUID = -752276948656079347L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts the owner's name from an MS temp file into {@link TikaCoreProperties#CREATOR};
+     * a UTF-16LE name following the ASCII chunk, when present, replaces the ASCII one.
+     * @throws IOException if the stream cannot be read or ends before the expected bytes
+     * @throws TikaException if the stored ASCII name length cannot fit in the leading chunk
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+            ParseContext context) throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
+        IOUtils.readFully(stream, asciiNameBytes);
+        int asciiNameLength = asciiNameBytes[0] & 0xFF;//unsigned: a corrupt byte can't go negative
+        if (asciiNameLength > ASCII_CHUNK_LENGTH - 1) {//the name would run past the chunk
+            throw new TikaException("Invalid ascii name length: " + asciiNameLength);
+        }
+        String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
+        metadata.set(TikaCoreProperties.CREATOR, asciiName);
+        int unicodeCharLength = stream.read();//-1 at end of stream, so the block below is skipped
+        if (unicodeCharLength > 0) {
+            stream.read();//zero after the char length
+            byte[] unicodeBytes = new byte[unicodeCharLength * 2];
+            IOUtils.readFully(stream, unicodeBytes);
+            String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
+            metadata.set(TikaCoreProperties.CREATOR, unicodeName);
+        }
+        xhtml.endDocument();
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index acb0224..10a5a7e 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -36,6 +36,7 @@ org.apache.tika.parser.mail.RFC822Parser
 org.apache.tika.parser.mbox.MboxParser
 org.apache.tika.parser.mbox.OutlookPSTParser
 org.apache.tika.parser.microsoft.JackcessParser
+org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 5dda858..bc83678 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1043,6 +1043,11 @@ public class TestMimeTypes {
         assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
     }
 
+    @Test
+    public void testMSOwner() throws Exception {
+        assertType("application/x-ms-owner", "testMSOwnerFile");//sample file name has no extension
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
new file mode 100644
index 0000000..3cef3df
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class MSOwnerFileParserTest extends TikaTest {
+    @Test//parsing the sample owner file must report "heidi" as the creator
+    public void testBasic() throws Exception {
+        String creator = getXML("testMSOwnerFile").metadata.get(TikaCoreProperties.CREATOR);
+        assertEquals("heidi", creator);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/01a9b6db/tika-parsers/src/test/resources/test-documents/testMSOwnerFile
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testMSOwnerFile b/tika-parsers/src/test/resources/test-documents/testMSOwnerFile
new file mode 100644
index 0000000..72a5f57
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testMSOwnerFile differ