You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/26 11:04:55 UTC

[tika] branch branch_1x updated (85f21a6a6 -> f218c99bc)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 85f21a6a6 TIKA-3729 -- upgrade metadata-extractor
     new 434c73680 TIKA-3733 -- fix newly discovered npe in WordExtractor and OutlookExtractor
     new f218c99bc TIKA-3734 -- fix new junrar IllegalArgumentException on streams of zero length.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../parser/microsoft/AbstractPOIFSExtractor.java     |  4 ----
 .../apache/tika/parser/microsoft/OfficeParser.java   |  2 +-
 .../tika/parser/microsoft/OutlookExtractor.java      |  9 +++++----
 .../apache/tika/parser/microsoft/WordExtractor.java  |  7 ++-----
 .../java/org/apache/tika/parser/pkg/RarParser.java   | 20 ++++++++++----------
 5 files changed, 18 insertions(+), 24 deletions(-)


[tika] 01/02: TIKA-3733 -- fix newly discovered npe in WordExtractor and OutlookExtractor

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 434c73680600c7faa8f6fb812d2156c3462d3982
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 06:58:36 2022 -0400

    TIKA-3733 -- fix newly discovered npe in WordExtractor and OutlookExtractor
---
 .../org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java | 4 ----
 .../main/java/org/apache/tika/parser/microsoft/OfficeParser.java | 2 +-
 .../java/org/apache/tika/parser/microsoft/OutlookExtractor.java  | 9 +++++----
 .../java/org/apache/tika/parser/microsoft/WordExtractor.java     | 7 ++-----
 4 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 3f9724977..d6bdca9c9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -52,10 +52,6 @@ abstract class AbstractPOIFSExtractor {
     protected final OfficeParserConfig officeParserConfig;
     protected final ParseContext context;
 
-    protected AbstractPOIFSExtractor(ParseContext context) {
-        this(context, null);
-    }
-
     protected AbstractPOIFSExtractor(ParseContext context, Metadata parentMetadata) {
         embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 67b09bb75..226c6c599 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -194,7 +194,7 @@ public class OfficeParser extends AbstractOfficeParser {
                 break;
             case OUTLOOK:
                 OutlookExtractor extractor =
-                        new OutlookExtractor(root, context);
+                        new OutlookExtractor(root, metadata, context);
 
                 extractor.parse(xhtml, metadata);
                 break;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 33b7fbf14..d440f1356 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -126,12 +126,13 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
 
     private final boolean extractAllAlternatives;
 
-    public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException {
-        this(filesystem.getRoot(), context);
+    public OutlookExtractor(POIFSFileSystem filesystem,
+                            Metadata parentMetadata, ParseContext context) throws TikaException {
+        this(filesystem.getRoot(), parentMetadata, context);
     }
 
-    public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
-        super(context);
+    public OutlookExtractor(DirectoryNode root, Metadata parentMetadata, ParseContext context) throws TikaException {
+        super(context, parentMetadata);
         this.parseContext = context;
         this.extractAllAlternatives = context.get(OfficeParserConfig.class).getExtractAllAlternativesFromMSG();
         try {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 6b3d08c8b..2f3f267ef 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -83,11 +83,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
 
     private final Deque<FormattingUtils.Tag> formattingState = new ArrayDeque<>();
 
-    private final Metadata metadata;
-
     public WordExtractor(ParseContext context, Metadata metadata) {
-        super(context);
-        this.metadata = metadata;
+        super(context, metadata);
     }
 
     private static int countParagraphs(Range... ranges) {
@@ -239,7 +236,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             return;
         }
         for (SavedByEntry sbe : savedByTable.getEntries()) {
-            metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+            parentMetadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
         }
     }
 


[tika] 02/02: TIKA-3734 -- fix new junrar IllegalArgumentException on streams of zero length.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f218c99bc41df0aa6d420595ba27cf3418298040
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 07:04:45 2022 -0400

    TIKA-3734 -- fix new junrar IllegalArgumentException on streams of zero length.
---
 .../java/org/apache/tika/parser/pkg/RarParser.java   | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
index bb1283933..407f3c434 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
@@ -83,16 +83,16 @@ public class RarParser extends AbstractParser {
             FileHeader header = rar.nextFileHeader();
             while (header != null && !Thread.currentThread().isInterrupted()) {
                 if (!header.isDirectory()) {
-                    try (InputStream subFile = rar.getInputStream(header)) {
-                        Metadata entrydata = PackageParser.handleEntryMetadata(
-                                "".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW(),
-                                header.getCTime(), header.getMTime(),
-                                header.getFullUnpackSize(),
-                                xhtml
-                        );
-
-                        if (extractor.shouldParseEmbedded(entrydata)) {
-                            extractor.parseEmbedded(subFile, handler, entrydata, true);
+                    if (header.getFullUnpackSize() > 0) {
+                        try (InputStream subFile = rar.getInputStream(header)) {
+                            Metadata entrydata = PackageParser.handleEntryMetadata(
+                                    "".equals(header.getFileNameW()) ? header.getFileNameString() :
+                                            header.getFileNameW(), header.getCTime(), header.getMTime(),
+                                    header.getFullUnpackSize(), xhtml);
+
+                            if (extractor.shouldParseEmbedded(entrydata)) {
+                                extractor.parseEmbedded(subFile, handler, entrydata, true);
+                            }
                         }
                     }
                 }