You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/15 13:22:27 UTC
tika git commit: TIKA-2008 -- add mime definition and parser for
MSOwnerFile
Repository: tika
Updated Branches:
refs/heads/2.x ffaa4deaa -> 60d4e3ff2
TIKA-2008 -- add mime definition and parser for MSOwnerFile
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/60d4e3ff
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/60d4e3ff
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/60d4e3ff
Branch: refs/heads/2.x
Commit: 60d4e3ff2aca931fd8e36d0f8ca8c2944e788aa4
Parents: ffaa4de
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 15 09:22:18 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 15 09:22:18 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 +
.../org/apache/tika/mime/TestMimeTypes.java | 5 ++
.../org/apache/tika/mime/tika-mimetypes.xml | 7 ++
.../org/apache/tika/module/office/BundleIT.java | 2 +-
.../parser/microsoft/MSOwnerFileParser.java | 81 +++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 1 +
.../parser/microsoft/MSOwnerFileParserTest.java | 31 +++++++
.../resources/test-documents/testMSOwnerFile | Bin 0 -> 162 bytes
8 files changed, 128 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 1d8f2cc..81243fe 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ Release 2.0 - Future Development
Release 1.14 - ???
+ * Add mime definition and parser for MS Owner File (TIKA-2008).
+
* Add mime definition for Windows Media Metafile (TIKA-2004).
* Add mime definitions of iCal and vCalendar (TIKA-2006).
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index eed11e8..d27c714 100644
--- a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -981,6 +981,11 @@ public class TestMimeTypes extends TikaTest {
assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
}
+ @Test
+ public void testMSOwner() throws Exception {
+ assertType("application/x-ms-owner", "testMSOwnerFile");
+ }
+
private void assertText(byte[] prefix) throws IOException {
assertMagic("text/plain", prefix);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index c513361..9ec8d76 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3403,6 +3403,13 @@
<mime-type type="application/x-ms-application">
<glob pattern="*.application"/>
</mime-type>
+ <mime-type type="application/x-ms-owner">
+ <_comment>Temporary files created by MSOffice applications</_comment>
+ <_comment>PRONOM fmt-473</_comment>
+ <magic priority="80">
+ <match value="(?s)^([\\x05-\\x0F]).{53}\\1\x00" type="regex" offset="0"/>
+ </magic>
+ </mime-type>
<mime-type type="application/x-ms-wmd">
<glob pattern="*.wmd"/>
</mime-type>
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java b/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
index 3f564fe..6336ddf 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-office-bundle/src/test/java/org/apache/tika/module/office/BundleIT.java
@@ -80,6 +80,6 @@ public class BundleIT {
@Test
public void testServicesCreated() throws Exception {
ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
- assertEquals("Not all Services have started", 24, services.length);
+ assertEquals("Not all Services have started", 25, services.length);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
new file mode 100644
index 0000000..02c07a6
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Parser for temporary MSOffice files.
+ * This currently only extracts the owner's name.
+ */
+public class MSOwnerFileParser extends AbstractParser {
+
+    /**
+     * Size in bytes of the leading chunk of the file: one length byte
+     * followed by the single-byte-encoded owner name. Bytes of the chunk
+     * beyond the declared name length are read and ignored.
+     */
+    private static final int ASCII_CHUNK_LENGTH = 54;
+
+    /**
+     * Longest single-byte name that fits in the leading chunk once the
+     * length byte itself has been accounted for.
+     */
+    private static final int MAX_ASCII_NAME_LENGTH = ASCII_CHUNK_LENGTH - 1;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+
+    /**
+     * Returns the single media type handled by this parser,
+     * {@code application/x-ms-owner}.
+     *
+     * @param context the parse context (not consulted)
+     * @return an immutable singleton set containing the supported type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts owner from MS temp file and stores it as
+     * {@link TikaCoreProperties#CREATOR}. The single-byte name from the
+     * leading chunk is set first; if a non-empty UTF-16LE name follows,
+     * it replaces the single-byte one. No text content is emitted, only
+     * an empty XHTML document.
+     *
+     * @param stream   the owner file contents
+     * @param handler  receives the (empty) XHTML document events
+     * @param metadata receives the owner name
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the stream ends before the expected bytes
+     *                       have been read, or on any other read failure
+     * @throws SAXException  if the handler rejects a document event
+     * @throws TikaException if the declared single-byte name length does
+     *                       not fit in the leading chunk
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
+        IOUtils.readFully(stream, asciiNameBytes);
+
+        // The length byte comes from untrusted input and is signed in Java:
+        // values >= 0x80 turn negative and values above 53 would run past
+        // the end of the chunk. Reject both explicitly instead of letting
+        // the String constructor fail with an unchecked exception.
+        int asciiNameLength = asciiNameBytes[0];
+        if (asciiNameLength < 0 || asciiNameLength > MAX_ASCII_NAME_LENGTH) {
+            throw new TikaException("Invalid owner name length " + asciiNameLength
+                    + "; expected a value between 0 and " + MAX_ASCII_NAME_LENGTH);
+        }
+        String asciiName = new String(asciiNameBytes, 1, asciiNameLength, StandardCharsets.US_ASCII);
+        metadata.set(TikaCoreProperties.CREATOR, asciiName);
+
+        // Character count of the UTF-16LE name; read() yields 0..255, or -1 at end of stream.
+        int unicodeCharLength = stream.read();
+        if (unicodeCharLength > 0) {
+            stream.read();//zero after the char length
+            byte[] unicodeBytes = new byte[unicodeCharLength * 2];
+            IOUtils.readFully(stream, unicodeBytes);
+            String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
+            // Prefer the Unicode form of the name over the single-byte one set above.
+            metadata.set(TikaCoreProperties.CREATOR, unicodeName);
+        }
+        xhtml.endDocument();
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 4d3290e..1c8cee1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -19,6 +19,7 @@ org.apache.tika.parser.microsoft.JackcessParser
org.apache.tika.parser.microsoft.OfficeParser
org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.MSOwnerFileParser
org.apache.tika.parser.microsoft.ooxml.OOXMLParser
org.apache.tika.parser.microsoft.xml.WordMLParser
org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
new file mode 100644
index 0000000..3cef3df
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/MSOwnerFileParserTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class MSOwnerFileParserTest extends TikaTest {
+
+    /**
+     * Parses the sample owner file and checks the creator metadata value.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        String creator = getXML("testMSOwnerFile").metadata.get(TikaCoreProperties.CREATOR);
+        assertEquals("heidi", creator);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/60d4e3ff/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile b/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile
new file mode 100644
index 0000000..72a5f57
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSOwnerFile differ