You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/15 13:22:27 UTC

tika git commit: TIKA-2008 -- add mime definition and parser for MSOwnerFile

Repository: tika
Updated Branches:
  refs/heads/2.x ffaa4deaa -> 60d4e3ff2


TIKA-2008 -- add mime definition and parser for MSOwnerFile


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/60d4e3ff
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/60d4e3ff
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/60d4e3ff

Branch: refs/heads/2.x
Commit: 60d4e3ff2aca931fd8e36d0f8ca8c2944e788aa4
Parents: ffaa4de
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 09:22:18 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 09:22:18 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../org/apache/tika/mime/TestMimeTypes.java     |   5 ++
 .../org/apache/tika/mime/tika-mimetypes.xml     |   7 ++
 .../org/apache/tika/module/office/BundleIT.java |   2 +-
 .../parser/microsoft/MSOwnerFileParser.java     |  81 +++++++++++++++++++
 .../services/org.apache.tika.parser.Parser      |   1 +
 .../parser/microsoft/MSOwnerFileParserTest.java |  31 +++++++
 .../resources/test-documents/testMSOwnerFile    | Bin 0 -> 162 bytes
 8 files changed, 128 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 1d8f2cc..81243fe 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ Release 2.0 - Future Development
 
 Release 1.14 - ???
 
+  * Add mime definition and parser for MS Owner File (TIKA-2008).
+
   * Add mime definition for Windows Media Metafile (TIKA-2004).
 
   * Add mime definitions of iCal and vCalendar (TIKA-2006).

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index eed11e8..d27c714 100644
--- a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -981,6 +981,11 @@ public class TestMimeTypes extends TikaTest {
         assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
     }
 
+    @Test
+    public void testMSOwner() throws Exception {
+        assertType("application/x-ms-owner", "testMSOwnerFile");
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index c513361..9ec8d76 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3403,6 +3403,13 @@
   <mime-type type="application/x-ms-application">
     <glob pattern="*.application"/>
   </mime-type>
+  <mime-type type="application/x-ms-owner">
+    <_comment>Temporary files created by MSOffice applications</_comment>
+    <_comment>PRONOM fmt-473</_comment>
+    <magic priority="80">
+      <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-ms-wmd">
     <glob pattern="*.wmd"/>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java b/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
index 3f564fe..6336ddf 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
@@ -80,6 +80,6 @@ public class BundleIT {
     @Test
     public void testServicesCreated() throws Exception {
         ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
-        assertEquals("Not all Services have started", 24, services.length);
+        assertEquals("Not all Services have started", 25, services.length);
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
new file mode 100644
index 0000000..02c07a6
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Parser for temporary MSOffice files.
+ * This currently only extracts the owner's name and sets it as the creator metadata value.
+ */
+public class MSOwnerFileParser extends AbstractParser {
+
+    private static final int ASCII_CHUNK_LENGTH = 54;
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Sets the creator from the length-prefixed ASCII name, then from the UTF-16LE name if one follows.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
+        IOUtils.readFully(stream, asciiNameBytes);
+        int asciiNameLength = (int)asciiNameBytes[0];//signed length prefix; NOTE(review): assumes 0..53, otherwise new String(...) below throws -- confirm
+        String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
+        metadata.set(TikaCoreProperties.CREATOR, asciiName);
+
+        int unicodeCharLength = stream.read();
+        if (unicodeCharLength > 0) {
+            stream.read();//skip the byte after the char length (expected to be zero)
+            byte[] unicodeBytes = new byte[unicodeCharLength * 2];
+            IOUtils.readFully(stream, unicodeBytes);
+            String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
+            metadata.set(TikaCoreProperties.CREATOR, unicodeName);
+        }
+        xhtml.endDocument();
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 4d3290e..1c8cee1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -19,6 +19,7 @@ org.apache.tika.parser.microsoft.JackcessParser
 org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.ooxml.OOXMLParser
 org.apache.tika.parser.microsoft.xml.WordMLParser
 org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
new file mode 100644
index 0000000..3cef3df
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class MSOwnerFileParserTest extends TikaTest {
+    @Test
+    public void testBasic() throws Exception {
+        XMLResult r = getXML("testMSOwnerFile");
+        assertEquals("heidi", r.metadata.get(TikaCoreProperties.CREATOR));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile b/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile
new file mode 100644
index 0000000..72a5f57
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile differ