You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/26 12:24:23 UTC

[tika] branch main updated (90c7e4c2d -> 079db8d82)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 90c7e4c2d TIKA-3730 -- fix checkstyle; hang head in shame.
     new 10a87151b general upgrades for 2.x
     new 15909b432 TIKA-3733 -- pass parent metadata to AbstractPOIFSExtractor in OutlookExtractor
     new 43d0434b8 TIKA-3734 -- avoid illegalargumentexception with zero byte streams
     new 079db8d82 TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   2 +
 tika-parent/pom.xml                                |  16 ++++----
 .../java/org/apache/tika/parser/dwg/DWGParser.java |  10 +++--
 .../org/apache/tika/parser/dwg/DWGParserTest.java  |  25 ++++++++++--
 .../resources/test-documents/testDWG-AC1027.dwg    | Bin 0 -> 265260 bytes
 .../resources/test-documents/testDWG-AC1032.dwg    | Bin 0 -> 158593 bytes
 .../apache/tika/parser/microsoft/OfficeParser.java |   4 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |  42 +++++++++++++++++++++
 .../java/org/apache/tika/parser/pkg/RarParser.java |  21 +++++++----
 tika-pipes/pom.xml                                 |   4 +-
 10 files changed, 97 insertions(+), 27 deletions(-)
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg


[tika] 02/04: TIKA-3733 -- pass parent metadata to AbstractPOIFSExtractor in OutlookExtractor

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 15909b43253ed671f76a3aed2205da3b67dbb97d
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 07:19:47 2022 -0400

    TIKA-3733 -- pass parent metadata to AbstractPOIFSExtractor in OutlookExtractor
---
 .../apache/tika/parser/microsoft/OfficeParser.java |  4 +--
 .../tika/parser/microsoft/OutlookExtractor.java    | 42 ++++++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index a155e470c..280b486bd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -235,9 +235,9 @@ public class OfficeParser extends AbstractOfficeParser {
                 }
                 break;
             case OUTLOOK:
-                OutlookExtractor extractor = new OutlookExtractor(root, context);
+                OutlookExtractor extractor = new OutlookExtractor(root, metadata, context);
 
-                extractor.parse(xhtml, metadata);
+                extractor.parse(xhtml);
                 break;
             case ENCRYPTED:
                 EncryptionInfo info = new EncryptionInfo(root);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a7983d7b4..2f19dec1a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -96,10 +96,27 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
     private final ParseContext parseContext;
     private final boolean extractAllAlternatives;
     HtmlEncodingDetector detector = new HtmlEncodingDetector();
+
+    /**
+     * @deprecated use {@link OutlookExtractor#OutlookExtractor(DirectoryNode, Metadata, ParseContext)}
+     *      Will be removed after 2.4.0
+     * @param filesystem
+     * @param context
+     * @throws TikaException
+     */
+    @Deprecated
     public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException {
         this(filesystem.getRoot(), context);
     }
 
+    /**
+     * @deprecated use {@link OutlookExtractor#OutlookExtractor(DirectoryNode, Metadata, ParseContext)}
+     *              Will be removed after 2.4.0
+     * @param root
+     * @param context
+     * @throws TikaException
+     */
+    @Deprecated
     public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
         super(context);
         this.parseContext = context;
@@ -112,6 +129,18 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    public OutlookExtractor(DirectoryNode root, Metadata metadata, ParseContext context) throws TikaException {
+        super(context, metadata);
+        this.parseContext = context;
+        this.extractAllAlternatives =
+                context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG();
+        try {
+            this.msg = new MAPIMessage(root);
+        } catch (IOException e) {
+            throw new TikaException("Failed to parse Outlook message", e);
+        }
+    }
+
     //need to add empty string to ensure that parallel arrays are parallel
     //even if one value is null.
     public static void addEvenIfNull(Property property, String value, Metadata metadata) {
@@ -156,6 +185,19 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    public void parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
+        parse(xhtml, parentMetadata);
+    }
+
+    /**
+     * @deprecated use {@link #parse(XHTMLContentHandler)}, will be removed after 2.4.0
+     * @param xhtml
+     * @param metadata
+     * @throws TikaException
+     * @throws SAXException
+     * @throws IOException
+     */
+    @Deprecated
     public void parse(XHTMLContentHandler xhtml, Metadata metadata)
             throws TikaException, SAXException, IOException {
         try {


[tika] 01/04: general upgrades for 2.x

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 10a87151b15a52a1d7d792e9ccc0cf546d30ebc0
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 07:12:18 2022 -0400

    general upgrades for 2.x
---
 tika-parent/pom.xml | 16 ++++++++--------
 tika-pipes/pom.xml  |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 1a3859228..465a1accb 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -288,7 +288,7 @@
     <rat.version>0.13</rat.version>
 
     <!-- dependency versions -->
-    <aws.version>1.12.200</aws.version>
+    <aws.version>1.12.206</aws.version>
     <asm.version>9.3</asm.version>
     <boilerpipe.version>1.1.0</boilerpipe.version>
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
@@ -306,16 +306,16 @@
     <commons.logging.version>1.2</commons.logging.version>
     <commons.math3.version>3.6.1</commons.math3.version>
     <ctakes.version>4.0.0.1</ctakes.version>
-    <cxf.version>3.5.1</cxf.version>
+    <cxf.version>3.5.2</cxf.version>
     <ddplist.version>1.23</ddplist.version>
     <dl4j.version>1.0.0-M2</dl4j.version>
     <!-- fakeload versions &gt; 0.4.0 require java > 8 -->
     <fakeload.version>0.4.0</fakeload.version>
     <geoapi.version>3.0.1</geoapi.version>
-    <google.cloud.version>2.6.0</google.cloud.version>
+    <google.cloud.version>2.6.1</google.cloud.version>
     <gson.version>2.9.0</gson.version>
     <guava.version>31.1-jre</guava.version>
-    <h2.version>2.1.210</h2.version>
+    <h2.version>2.1.212</h2.version>
     <httpcomponents.version>4.5.13</httpcomponents.version>
     <httpcore.version>4.4.15</httpcore.version>
     <imageio.version>1.4.0</imageio.version>
@@ -331,7 +331,7 @@
     <jdom2.version>2.0.6.1</jdom2.version>
     <jempbox.version>1.8.16</jempbox.version>
     <jetty.version>9.4.46.v20220331</jetty.version>
-    <jhighlight.version>1.0.3</jhighlight.version>
+    <jhighlight.version>1.1.0</jhighlight.version>
     <jna.version>5.11.0</jna.version>
     <joda.time.version>2.10.10</joda.time.version>
     <json.simple.version>1.1.1</json.simple.version>
@@ -348,7 +348,7 @@
     <microsoft.translator.version>0.6.2</microsoft.translator.version>
     <!-- 0.8.6 is built with java 11 and does not work with Java 8 -->
     <mime4j.version>0.8.4</mime4j.version>
-    <mockito.version>4.4.0</mockito.version>
+    <mockito.version>4.5.1</mockito.version>
     <netcdf-java.version>4.5.5</netcdf-java.version>
     <opencsv.version>2.3</opencsv.version>
     <objenesis.version>3.2</objenesis.version>
@@ -371,10 +371,10 @@
     <!-- we'll need to stay on 1.7 until we're java modularized ? -->
     <slf4j.version>1.7.36</slf4j.version>
     <solrj.version>8.11.1</solrj.version>
-    <spring.version>5.3.18</spring.version>
+    <spring.version>5.3.19</spring.version>
     <sqlite.version>3.36.0.3</sqlite.version>
     <tagsoup.version>1.2.1</tagsoup.version>
-    <test.containers.version>1.16.3</test.containers.version>
+    <test.containers.version>1.17.1</test.containers.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
     <tukaani.version>1.9</tukaani.version>
     <twelvemonkeys.version>3.8.2</twelvemonkeys.version>
diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml
index 81cb97513..b52ec4bec 100644
--- a/tika-pipes/pom.xml
+++ b/tika-pipes/pom.xml
@@ -30,7 +30,7 @@
   <packaging>pom</packaging>
 
   <properties>
-    <netty.version>4.1.75.Final</netty.version>
+    <netty.version>4.1.76.Final</netty.version>
   </properties>
 
   <modules>
@@ -100,7 +100,7 @@
       <dependency>
         <groupId>io.netty</groupId>
         <artifactId>netty-tcnative-classes</artifactId>
-        <version>2.0.47.Final</version>
+        <version>2.0.51.Final</version>
       </dependency>
       <dependency>
         <groupId>io.netty</groupId>


[tika] 03/04: TIKA-3734 -- avoid illegalargumentexception with zero byte streams

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 43d0434b82600458d053772caf6dedb22ece7013
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 07:21:18 2022 -0400

    TIKA-3734 -- avoid illegalargumentexception with zero byte streams
---
 .../java/org/apache/tika/parser/pkg/RarParser.java  | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
index 5d0d72608..358fa59b4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.pkg;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Collections;
@@ -83,14 +84,20 @@ public class RarParser extends AbstractParser {
             FileHeader header = rar.nextFileHeader();
             while (header != null && !Thread.currentThread().isInterrupted()) {
                 if (!header.isDirectory()) {
-                    try (InputStream subFile = rar.getInputStream(header)) {
-                        Metadata entrydata = PackageParser.handleEntryMetadata(
-                                "".equals(header.getFileNameW()) ? header.getFileNameString() :
-                                        header.getFileNameW(), header.getCTime(), header.getMTime(),
-                                header.getFullUnpackSize(), xhtml);
-
+                    Metadata entrydata = PackageParser.handleEntryMetadata(
+                            "".equals(header.getFileNameW()) ? header.getFileNameString() :
+                                    header.getFileNameW(), header.getCTime(), header.getMTime(),
+                            header.getFullUnpackSize(), xhtml);
+                    if (header.getFullUnpackSize() > 0) {
+                        try (InputStream subFile = rar.getInputStream(header)) {
+                            if (extractor.shouldParseEmbedded(entrydata)) {
+                                extractor.parseEmbedded(subFile, handler, entrydata, true);
+                            }
+                        }
+                    } else {
                         if (extractor.shouldParseEmbedded(entrydata)) {
-                            extractor.parseEmbedded(subFile, handler, entrydata, true);
+                            extractor.parseEmbedded(new ByteArrayInputStream(new byte[0]), handler,
+                                    entrydata, true);
                         }
                     }
                 }


[tika] 04/04: TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 079db8d8286d681dd05568b532e11fbf02f23fd0
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 08:24:05 2022 -0400

    TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata
---
 CHANGES.txt                                        |   2 ++
 .../java/org/apache/tika/parser/dwg/DWGParser.java |  10 +++++----
 .../org/apache/tika/parser/dwg/DWGParserTest.java  |  25 +++++++++++++++++----
 .../resources/test-documents/testDWG-AC1027.dwg    | Bin 0 -> 265260 bytes
 .../resources/test-documents/testDWG-AC1032.dwg    | Bin 0 -> 158593 bytes
 5 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index d78cc9351..547383cca 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,8 @@ Release 2.4.0 - ???
      https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
      for the dependencies that must be provided at run-time (TIKA-3676).
 
+   * NOTE: Added prefix "dwg-custom:" to DWG custom metadata properties (TIKA-3731).
+
    * Add initial, BETA-grade TLS encryption option for tika-server;
      configuration may change in future releases (TIKA-3719).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 385418c7e..4519623fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -43,7 +43,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
  * lots of the low level string/int/short concepts are the same.
  */
 public class DWGParser extends AbstractParser {
-
+    public static String DWG_CUSTOM_META_PREFIX = "dwg-custom:";
     /**
      * Serial version UID
      */
@@ -115,6 +115,8 @@ public class DWGParser extends AbstractParser {
                     get2004Props(stream, metadata, xhtml);
                 }
                 break;
+            case "AC1027":
+            case "AC1032":
             case "AC1021":
             case "AC1024":
                 metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
@@ -146,7 +148,7 @@ public class DWGParser extends AbstractParser {
             String propName = read2004String(stream);
             String propValue = read2004String(stream);
             if (propName.length() > 0 && propValue.length() > 0) {
-                metadata.add(propName, propValue);
+                metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
             }
         }
     }
@@ -182,7 +184,7 @@ public class DWGParser extends AbstractParser {
             String propName = read2007and2010String(stream);
             String propValue = read2007and2010String(stream);
             if (propName.length() > 0 && propValue.length() > 0) {
-                metadata.add(propName, propValue);
+                metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
             }
         }
     }
@@ -233,7 +235,7 @@ public class DWGParser extends AbstractParser {
                     if (splitAt > -1) {
                         String propName = val.substring(0, splitAt);
                         String propVal = val.substring(splitAt + 1);
-                        metadata.add(propName, propVal);
+                        metadata.add(DWGParser.DWG_CUSTOM_META_PREFIX + propName, propVal);
                     }
                 }
             } else {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index f28019f0b..88807b087 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -26,11 +26,12 @@ import java.util.Arrays;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.BodyContentHandler;
 
-public class DWGParserTest {
+public class DWGParserTest extends TikaTest {
 
     @Test
     public void testDWG2000Parser() throws Exception {
@@ -81,8 +82,10 @@ public class DWGParserTest {
             ContentHandler handler = new BodyContentHandler();
             new DWGParser().parse(input, handler, metadata, null);
 
-            assertEquals("valueforcustomprop1", metadata.get("customprop1"));
-            assertEquals("valueforcustomprop2", metadata.get("customprop2"));
+            assertEquals("valueforcustomprop1",
+                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop1"));
+            assertEquals("valueforcustomprop2",
+                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop2"));
         }
     }
 
@@ -165,7 +168,8 @@ public class DWGParserTest {
             assertEquals("This is a comment", metadata.get(TikaCoreProperties.COMMENTS));
             assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER));
             assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION));
-            assertEquals("MyCustomPropertyValue", metadata.get("MyCustomProperty"));
+            assertEquals("MyCustomPropertyValue",
+                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "MyCustomProperty"));
 
             String content = handler.toString();
             assertContains("This is a comment", content);
@@ -174,4 +178,17 @@ public class DWGParserTest {
             input.close();
         }
     }
+
+    @Test
+    public void testAC1027() throws Exception {
+        Metadata metadata = getXML("testDWG-AC1027.dwg").metadata;
+        assertEquals("hlu", metadata.get(TikaCoreProperties.MODIFIER));
+    }
+
+    @Test
+    public void testAC1032() throws Exception {
+        Metadata metadata = getXML("testDWG-AC1032.dwg").metadata;
+        assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S ADDRESS"));
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg
new file mode 100644
index 000000000..9409e3fec
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg
new file mode 100644
index 000000000..5e644bf21
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg differ