You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/03/15 19:52:10 UTC

[tika] branch main updated: TIKA-3987 (#1017)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2911829f3 TIKA-3987 (#1017)
2911829f3 is described below

commit 2911829f30b2ad2b6631837739d98cd959fc14fa
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed Mar 15 15:52:03 2023 -0400

    TIKA-3987 (#1017)
    
    * TIKA-3987 -- add a parser for ActiveMime files
---
 CHANGES.txt                                        |  2 +
 .../org/apache/tika/mime/tika-mimetypes.xml        |  8 ++-
 .../microsoft/activemime/ActiveMimeParser.java     | 83 ++++++++++++++++++++++
 .../microsoft/onenote/OneNotePropertyEnum.java     |  1 +
 .../services/org.apache.tika.parser.Parser         |  1 +
 .../microsoft/activemime/ActiveMimeParserTest.java | 47 ++++++++++++
 6 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 87671965c..818e935c3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.7.1 - ???
 
+   * Add detection and a parser for ActiveMime files (TIKA-3987).
+
    * Users may now avoid the ZeroByteFileException via a
      setting on the AutoDetectParserConfig (TIKA-3976).
 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ea735de55..cb1b5d48c 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3181,7 +3181,13 @@
   <mime-type type="application/x-ace-compressed">
     <glob pattern="*.ace"/>
   </mime-type>
-
+  <mime-type type="application/x-activemime">
+    <magic priority="60">
+      <!-- ActiveMime\u0000\u0000 -->
+      <match value="0x4163746976654D696D650000" type="string" offset="0">
+    </match>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-axcrypt">
     <_comment>AxCrypt</_comment>
     <glob pattern="*.axx" />
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java
new file mode 100644
index 000000000..9e2c94d05
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.activemime;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * ActiveMime is a macro container format used in some mso files.  See, e.g.
+ * <a href="https://mastodon.social/@Ange/110027138524274526">Ange's toot</a>.
+ */
+public class ActiveMimeParser extends AbstractParser {
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-activemime");
+    private static final Set<MediaType> SUPPORTED = Collections.singleton(MEDIA_TYPE);
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        //based on: https://mastodon.social/@Ange/110027138524274526
+        IOUtils.skipFully(stream, 12); //header
+        IOUtils.skipFully(stream, 2); //version
+        IOUtils.skipFully(stream, 4); //flag1 04000000
+        IOUtils.skipFully(stream, 4);//reserved ffffffff
+        IOUtils.skipFully(stream, 4);//flag2 000006F0
+        // do something with this?  If so, use readUInt
+        // long datasize = LittleEndian.readUInt(stream);
+        IOUtils.skipFully(stream, 4); //datasize
+        long zlibOffset = LittleEndian.readUInt(stream);
+        IOUtils.skipFully(stream, 4);//flag
+        IOUtils.skipFully(stream, 4);//uncompressed size
+        IOUtils.skipFully(stream, 4);//don't know
+
+        IOUtils.skipFully(stream, zlibOffset);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        try (InputStream payload = new DeflateCompressorInputStream(stream)) {
+            try (POIFSFileSystem poifs = new POIFSFileSystem(payload)) {
+                OfficeParser.extractMacros(poifs, xhtml, ex);
+            }
+        }
+        xhtml.endDocument();
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
index f676b5567..8f67b416c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
@@ -76,6 +76,7 @@ public enum OneNotePropertyEnum {
     WebPictureContainer14(0x200034C8), ImageUploadState(0x140034CB), TextExtendedAscii(0x1C003498),
     PictureWidth(0x140034CD), PictureHeight(0x140034CE), PageMarginOriginX(0x14001D0F),
     PageMarginOriginY(0x14001D10), WzHyperlinkUrl(0x1C001E20), TaskTagDueDate(0x1400346B),
+    NotebookElementOrderingID(0x14001CB9),
     Unknown(0x00000000);
 
     private static final Map<Long, OneNotePropertyEnum> BY_ID = new HashMap<>();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index a8c58a28c..621530848 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -20,6 +20,7 @@ org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.activemime.ActiveMimeParser
 org.apache.tika.parser.microsoft.onenote.OneNoteParser
 org.apache.tika.parser.microsoft.ooxml.OOXMLParser
 org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java
new file mode 100644
index 000000000..c953eeffe
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.activemime;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class ActiveMimeParserTest extends TikaTest {
+
+
+    @Test
+    @Disabled("until we get permission to use the file")
+    public void testBasic() throws Exception {
+        //file used in testing is here: https://telparia.com/fileFormatSamples/archive/activeMime/editdata.mso
+        //if we get permission to add it to our repo, these should work
+        Path p = Paths.get(".../editdata.mso");
+        List<Metadata> metadataList = getRecursiveMetadata(p);
+        assertEquals("application/x-activemime", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertEquals(5, metadataList.size());
+        assertContains("Arquivo Gerado com sucesso!!!",
+                metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+}