You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/03/15 19:52:10 UTC
[tika] branch main updated: TIKA-3987 (#1017)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2911829f3 TIKA-3987 (#1017)
2911829f3 is described below
commit 2911829f30b2ad2b6631837739d98cd959fc14fa
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed Mar 15 15:52:03 2023 -0400
TIKA-3987 (#1017)
* TIKA-3987 -- add a parser for ActiveMime files
---
CHANGES.txt | 2 +
.../org/apache/tika/mime/tika-mimetypes.xml | 8 ++-
.../microsoft/activemime/ActiveMimeParser.java | 83 ++++++++++++++++++++++
.../microsoft/onenote/OneNotePropertyEnum.java | 1 +
.../services/org.apache.tika.parser.Parser | 1 +
.../microsoft/activemime/ActiveMimeParserTest.java | 47 ++++++++++++
6 files changed, 141 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 87671965c..818e935c3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.7.1 - ???
+ * Add detection and a parser for ActiveMime files (TIKA-3987).
+
* Users may now avoid the ZeroByteFileException via a
setting on the AutoDetectParserConfig (TIKA-3976).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ea735de55..cb1b5d48c 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3181,7 +3181,13 @@
<mime-type type="application/x-ace-compressed">
<glob pattern="*.ace"/>
</mime-type>
-
+ <mime-type type="application/x-activemime">
+ <magic priority="60">
+ <!-- magic at offset 0: ASCII "ActiveMime" followed by two 0x00 bytes (ActiveMime\u0000\u0000) -->
+ <match value="0x4163746976654D696D650000" type="string" offset="0">
+ </match>
+ </magic>
+ </mime-type>
<mime-type type="application/x-axcrypt">
<_comment>AxCrypt</_comment>
<glob pattern="*.axx" />
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java
new file mode 100644
index 000000000..9e2c94d05
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.activemime;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser for ActiveMime, a macro container format used in some mso files. See, e.g.
+ * <a href="https://mastodon.social/@Ange/110027138524274526">Ange's toot</a>.
+ * <p>
+ * The parser skips the fixed-size header fields, reads the offset of the compressed
+ * payload, inflates that payload, opens it as a {@link POIFSFileSystem} and passes it to
+ * {@link OfficeParser#extractMacros} together with the embedded document extractor.
+ */
+public class ActiveMimeParser extends AbstractParser {
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-activemime");
+    private static final Set<MediaType> SUPPORTED = Collections.singleton(MEDIA_TYPE);
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the single supported type, <code>application/x-activemime</code>
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED;
+    }
+
+    /**
+     * Extracts the macros stored in an ActiveMime container.
+     *
+     * @param stream   the ActiveMime document; it is consumed but, per the parser contract,
+     *                 left open for the caller to close
+     * @param handler  receives the XHTML SAX events
+     * @param metadata document metadata handed to the XHTML handler
+     * @param context  parse context used to look up the embedded document extractor
+     * @throws IOException   if the stream ends prematurely or cannot be read or inflated
+     * @throws SAXException  if the content handler fails
+     * @throws TikaException if macro extraction fails
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        //based on: https://mastodon.social/@Ange/110027138524274526
+        IOUtils.skipFully(stream, 12); //header
+        IOUtils.skipFully(stream, 2); //version
+        IOUtils.skipFully(stream, 4); //flag1 04000000
+        IOUtils.skipFully(stream, 4); //reserved ffffffff
+        IOUtils.skipFully(stream, 4); //flag2 000006F0
+        //datasize is currently unused; read it with LittleEndian.readUInt if it is ever needed
+        IOUtils.skipFully(stream, 4); //datasize
+        long zlibOffset = LittleEndian.readUInt(stream);
+        IOUtils.skipFully(stream, 4); //flag
+        IOUtils.skipFully(stream, 4); //uncompressed size
+        IOUtils.skipFully(stream, 4); //don't know
+
+        //the offset is counted from the end of the fields above
+        IOUtils.skipFully(stream, zlibOffset);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        //Shield the caller's stream: closing the decompressor must release the inflater
+        //but must not close a stream that this parser does not own.
+        try (InputStream payload = new DeflateCompressorInputStream(
+                org.apache.commons.io.input.CloseShieldInputStream.wrap(stream));
+                POIFSFileSystem poifs = new POIFSFileSystem(payload)) {
+            OfficeParser.extractMacros(poifs, xhtml, ex);
+        }
+        xhtml.endDocument();
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
index f676b5567..8f67b416c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
@@ -76,6 +76,7 @@ public enum OneNotePropertyEnum {
WebPictureContainer14(0x200034C8), ImageUploadState(0x140034CB), TextExtendedAscii(0x1C003498),
PictureWidth(0x140034CD), PictureHeight(0x140034CE), PageMarginOriginX(0x14001D0F),
PageMarginOriginY(0x14001D10), WzHyperlinkUrl(0x1C001E20), TaskTagDueDate(0x1400346B),
+ NotebookElementOrderingID(0x14001CB9),
Unknown(0x00000000);
private static final Map<Long, OneNotePropertyEnum> BY_ID = new HashMap<>();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index a8c58a28c..621530848 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -20,6 +20,7 @@ org.apache.tika.parser.microsoft.MSOwnerFileParser
org.apache.tika.parser.microsoft.OfficeParser
org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.activemime.ActiveMimeParser
org.apache.tika.parser.microsoft.onenote.OneNoteParser
org.apache.tika.parser.microsoft.ooxml.OOXMLParser
org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java
new file mode 100644
index 000000000..c953eeffe
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.activemime;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Exercises detection and recursive parsing of an ActiveMime sample file.
+ */
+public class ActiveMimeParserTest extends TikaTest {
+
+    /**
+     * Checks the container's content type, the number of metadata objects returned by
+     * the recursive parse, and a known string in the content of the last one.
+     */
+    @Test
+    @Disabled("until we get permission to use the file")
+    public void testBasic() throws Exception {
+        //file used in testing is here: https://telparia.com/fileFormatSamples/archive/activeMime/editdata.mso
+        //if we get permission to add it to our repo, these should work
+        Path sample = Paths.get(".../editdata.mso");
+        List<Metadata> extracted = getRecursiveMetadata(sample);
+
+        String containerType = extracted.get(0).get(Metadata.CONTENT_TYPE);
+        assertEquals("application/x-activemime", containerType);
+
+        int documentCount = extracted.size();
+        assertEquals(5, documentCount);
+
+        String lastContent = extracted.get(4).get(TikaCoreProperties.TIKA_CONTENT);
+        assertContains("Arquivo Gerado com sucesso!!!", lastContent);
+    }
+}