You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/14 15:41:31 UTC

[tika] branch main updated (ef9f442 -> 9b8644b)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from ef9f442  TIKA-3476 -- turn off tags reports in default comparison and profile reports.
     new 070222a  TIKA-3477 -- don't close embedded word doc because that in turn closes the root.  Unrelated issue: avoid NPE if OOXML part doesn't exist.
     new 9b8644b  TIKA-3478 -- handle "desc" field

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/tika/parser/mp4/boxes/TikaUserDataBox.java |  4 +++-
 .../apache/tika/parser/microsoft/WordExtractor.java   | 19 ++++++++++---------
 .../microsoft/ooxml/AbstractOOXMLExtractor.java       |  3 +++
 3 files changed, 16 insertions(+), 10 deletions(-)

[tika] 01/02: TIKA-3477 -- don't close embedded word doc because that in turn closes the root. Unrelated issue: avoid NPE if OOXML part doesn't exist.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 070222a6a968aca6aff6fdf438379dfe4723e7e0
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 14 10:19:25 2021 -0400

    TIKA-3477 -- don't close embedded word doc because that in turn closes the root.  Unrelated issue: avoid NPE if OOXML part doesn't exist.
---
 .../apache/tika/parser/microsoft/WordExtractor.java   | 19 ++++++++++---------
 .../microsoft/ooxml/AbstractOOXMLExtractor.java       |  3 +++
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index f8ff5e1..7816073 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -85,11 +85,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
 
     private final Deque<FormattingUtils.Tag> formattingState = new ArrayDeque<>();
 
-    private final Metadata metadata;
-
     public WordExtractor(ParseContext context, Metadata metadata) {
-        super(context);
-        this.metadata = metadata;
+        super(context, metadata);
     }
 
     private static int countParagraphs(Range... ranges) {
@@ -218,14 +215,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             p = pictures.nextUnclaimed();
         }
 
-        // Handle any embeded office documents
+        // Handle any embedded office documents
         try {
+
             DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
             for (Entry entry : op) {
                 if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                     handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                 }
             }
+
         } catch (FileNotFoundException e) {
             //swallow
         }
@@ -237,7 +236,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             return;
         }
         for (SavedByEntry sbe : savedByTable.getEntries()) {
-            metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+            parentMetadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
         }
     }
 
@@ -596,9 +595,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
         Word6Extractor extractor;
-        try (HWPFOldDocument doc = new HWPFOldDocument(root)) {
-            extractor = new Word6Extractor(doc);
-        }
+        //DO NOT put this in a try/autoclose.  This will close the root
+        //which we don't want to do because there may be other
+        //documents in the root.
+        HWPFOldDocument doc = new HWPFOldDocument(root);
+        extractor = new Word6Extractor(doc);
 
         for (String p : extractor.getParagraphText()) {
             xhtml.element("p", p);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index cd6c6e5..f23ae12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -167,6 +167,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             for (PackageRelationship rel : opcPackage
                     .getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
                 PackagePart tPart = opcPackage.getPart(rel);
+                if (tPart == null) {
+                    continue;
+                }
                 InputStream tStream = tPart.getInputStream();
                 Metadata thumbnailMetadata = new Metadata();
                 String thumbName = tPart.getPartName().getName();

[tika] 02/02: TIKA-3478 -- handle "desc" field

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9b8644b54d94fcc54142d40be2bb6abfca08fa6e
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 14 10:51:50 2021 -0400

    TIKA-3478 -- handle "desc" field
---
 .../main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java   | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java
index 778a656..fcde0fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java
@@ -245,9 +245,11 @@ public class TikaUserDataBox extends Box {
             case "\u00A9lyr" :
                 xhtml.element("p", value);
                 break;
-            case "ldes" :
+            case "ldes" : //intentional fall through
+            case "desc" :
                 metadata.set(TikaCoreProperties.DESCRIPTION, value);
                 xhtml.element("p", value);
+                break;
             case "xid " :
                 //not sure this is the right use of this key
                 metadata.set(XMP.IDENTIFIER, value);