You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/03/15 19:20:04 UTC

[tika] branch TIKA-3987 created (now c3a44726f)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3987
in repository https://gitbox.apache.org/repos/asf/tika.git


      at c3a44726f TIKA-3987 -- add a parser for ActiveMime files

This branch includes the following new commits:

     new c3a44726f TIKA-3987 -- add a parser for ActiveMime files

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3987 -- add a parser for ActiveMime files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3987
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c3a44726fb131e894835582faab82a8a8ca1509f
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 15 15:19:40 2023 -0400

    TIKA-3987 -- add a parser for ActiveMime files
---
 CHANGES.txt                                        |  2 +
 .../org/apache/tika/mime/tika-mimetypes.xml        |  8 ++-
 .../microsoft/activemime/ActiveMimeParser.java     | 82 ++++++++++++++++++++++
 .../microsoft/onenote/OneNotePropertyEnum.java     |  1 +
 .../services/org.apache.tika.parser.Parser         |  1 +
 .../microsoft/activemime/ActiveMimeParserTest.java | 47 +++++++++++++
 6 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 87671965c..818e935c3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.7.1 - ???
 
+   * Add detection and a parser for ActiveMime files (TIKA-3987).
+
    * Users may now avoid the ZeroByteFileException via a
      setting on the AutoDetectParserConfig (TIKA-3976).
 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ea735de55..cb1b5d48c 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3181,7 +3181,13 @@
   <mime-type type="application/x-ace-compressed">
     <glob pattern="*.ace"/>
   </mime-type>
-
+  <mime-type type="application/x-activemime">
+    <magic priority="60">
+      <!-- ActiveMime\u0000\u0000 -->
+      <match value="0x4163746976654D696D650000" type="string" offset="0">
+    </match>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-axcrypt">
     <_comment>AxCrypt</_comment>
     <glob pattern="*.axx" />
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java
new file mode 100644
index 000000000..850be0f64
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParser.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.activemime;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * ActiveMime is a macro container format used in some mso files.  See, e.g.
+ * <a href="https://mastodon.social/@Ange/110027138524274526">Ange's toot</a>.
+ */
+public class ActiveMimeParser extends AbstractParser {
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-activemime");
+    private static final Set<MediaType> SUPPORTED = Collections.singleton(MEDIA_TYPE);
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        //based on: https://mastodon.social/@Ange/110027138524274526
+        IOUtils.skipFully(stream, 12); //header
+        IOUtils.skipFully(stream, 2); //version
+        IOUtils.skipFully(stream, 4); //flag1 040000
+        IOUtils.skipFully(stream, 4);//reserved ffffff
+        IOUtils.skipFully(stream, 4);//flag2 0000
+        long datasize = LittleEndian.readUInt(stream);//datasize
+        long zlibOffset = LittleEndian.readUInt(stream);
+        IOUtils.skipFully(stream, 4);//flag
+        IOUtils.skipFully(stream, 4);//uncompressed size
+        IOUtils.skipFully(stream, 4);//don't know
+
+        IOUtils.skipFully(stream, zlibOffset);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        try (InputStream payload = new DeflateCompressorInputStream(stream)) {
+            try (POIFSFileSystem poifs = new POIFSFileSystem(payload)) {
+                OfficeParser.extractMacros(poifs, xhtml, ex);
+            }
+        }
+        xhtml.endDocument();
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
index f676b5567..8f67b416c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
@@ -76,6 +76,7 @@ public enum OneNotePropertyEnum {
     WebPictureContainer14(0x200034C8), ImageUploadState(0x140034CB), TextExtendedAscii(0x1C003498),
     PictureWidth(0x140034CD), PictureHeight(0x140034CE), PageMarginOriginX(0x14001D0F),
     PageMarginOriginY(0x14001D10), WzHyperlinkUrl(0x1C001E20), TaskTagDueDate(0x1400346B),
+    NotebookElementOrderingID(0x14001CB9),
     Unknown(0x00000000);
 
     private static final Map<Long, OneNotePropertyEnum> BY_ID = new HashMap<>();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index a8c58a28c..621530848 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -20,6 +20,7 @@ org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.activemime.ActiveMimeParser
 org.apache.tika.parser.microsoft.onenote.OneNoteParser
 org.apache.tika.parser.microsoft.ooxml.OOXMLParser
 org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java
new file mode 100644
index 000000000..c953eeffe
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/activemime/ActiveMimeParserTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.activemime;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class ActiveMimeParserTest extends TikaTest {
+
+    /**
+     * Runs recursive extraction over a sample ActiveMime file and checks the
+     * container's content type, the number of metadata objects returned and the
+     * text content of the fifth one.  Disabled until the sample file may be added
+     * to the repository.
+     */
+    @Test
+    @Disabled("until we get permission to use the file")
+    public void testBasic() throws Exception {
+        //sample file: https://telparia.com/fileFormatSamples/archive/activeMime/editdata.mso
+        //the path below is a placeholder; once the file may be added to our repo,
+        //point it at the real location and these assertions should work
+        final Path sample = Paths.get(".../editdata.mso");
+        final List<Metadata> extracted = getRecursiveMetadata(sample);
+
+        final Metadata container = extracted.get(0);
+        assertEquals("application/x-activemime", container.get(Metadata.CONTENT_TYPE));
+
+        assertEquals(5, extracted.size());
+
+        final Metadata fifth = extracted.get(4);
+        assertContains("Arquivo Gerado com sucesso!!!",
+                fifth.get(TikaCoreProperties.TIKA_CONTENT));
+    }
+}