You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/23 10:33:51 UTC

[tika] 01/01: TIKA-3164 -- WIP POI 5.0.0

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7aa27326fcc285c52876a0ae3759d5fe93f62789
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 23 05:33:26 2021 -0500

    TIKA-3164 -- WIP POI 5.0.0
---
 tika-parent/pom.xml                                |  2 +-
 tika-parsers/pom.xml                               |  8 ++++++
 .../tika/parser/microsoft/OutlookExtractor.java    |  4 +--
 .../microsoft/ooxml/OOXMLExtractorFactory.java     | 30 ++++++++++++++--------
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |  4 +--
 .../ooxml/XSLFPowerPointExtractorDecorator.java    | 18 +++----------
 .../microsoft/ooxml/xps/XPSTextExtractor.java      | 25 ++++++++++++++++--
 .../xslf/XSLFEventBasedPowerPointExtractor.java    | 23 +++++++++++++++--
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    | 24 +++++++++++++++--
 9 files changed, 101 insertions(+), 37 deletions(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 626b1a2..baccf43 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -334,7 +334,7 @@
     <maven.shade.version>3.2.4</maven.shade.version>
     <rat.version>0.13</rat.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
-    <poi.version>4.1.2</poi.version>
+    <poi.version>5.0.0</poi.version>
     <commons.compress.version>1.20</commons.compress.version>
     <commons.io.version>2.8.0</commons.io.version>
     <commons.lang3.version>3.11</commons.lang3.version>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index fa5e6f0..53e0a26 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -264,6 +264,14 @@
           <groupId>org.apache.commons</groupId>
           <artifactId>commons-compress</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.woodstox</groupId>
+          <artifactId>woodstox-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-all</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index c2e27d6..b8c3d66 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -700,8 +700,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
 
         for (RecipientChunks chunks : recipientChunks) {
             Recipient r = new Recipient();
-            r.displayName = (chunks.recipientDisplayNameChunk != null) ? chunks.recipientDisplayNameChunk.toString() : null;
-            r.name = (chunks.recipientNameChunk != null) ? chunks.recipientNameChunk.toString() : null;
+            r.displayName = (chunks.getRecipientDisplayNameChunk() != null) ? chunks.getRecipientDisplayNameChunk().toString() : null;
+            r.name = (chunks.getRecipientNameChunk() != null) ? chunks.getRecipientNameChunk().toString() : null;
             r.emailAddress = chunks.getRecipientEmailAddress();
             List<PropertyValue> vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE);
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 15f2c33..e2dc17e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -22,9 +22,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;
 
-import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@ -34,10 +33,9 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.util.LocaleUtil;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.apache.poi.xslf.usermodel.XSLFSlideShow;
 import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -64,6 +62,8 @@ import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors;
+
 /**
  * Figures out the correct {@link OOXMLExtractor} for the supplied document and
  * returns it.
@@ -72,13 +72,17 @@ public class OOXMLExtractorFactory {
 
     private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
     private static final int MAX_BUFFER_LENGTH = 1000000;
+    private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
+
+    static {
+        setThreadPrefersEventExtractors(true);
+    }
 
     public static void parse(
             InputStream stream, ContentHandler baseHandler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
-        ExtractorFactory.setThreadPrefersEventExtractors(true);
 
         //if there's a problem opening the zip file;
         //create a tmp file, and copy what you can read of it.
@@ -167,7 +171,7 @@ public class OOXMLExtractorFactory {
             }
 
             if (poiExtractor == null) {
-                poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg);
+                poiExtractor = EXTRACTOR_FACTORY.create(pkg);
             }
 
             POIXMLDocument document = poiExtractor.getDocument();
@@ -192,8 +196,8 @@ public class OOXMLExtractorFactory {
                                 "The extractor returned was a " + poiExtractor
                 );
             } else if (document instanceof XMLSlideShow) {
-                extractor = new XSLFPowerPointExtractorDecorator(
-                        context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
+                extractor = new XSLFPowerPointExtractorDecorator( metadata,
+                        context, (org.apache.poi.xslf.extractor.XSLFExtractor) poiExtractor);
             } else if (document instanceof XWPFDocument) {
                 extractor = new XWPFWordExtractorDecorator( metadata,
                         context, (XWPFWordExtractor) poiExtractor);
@@ -279,7 +283,11 @@ public class OOXMLExtractorFactory {
         }
         String targetContentType = corePart.getContentType();
 
-        XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
+        //TODO: hoist this array into a static final constant, or find the POI 5.x replacement for XSLFPowerPointExtractor.SUPPORTED_TYPES
+        XSLFRelation[] xslfRelations = new XSLFRelation[] {
+                XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
+                XSLFRelation.PRESENTATIONML_TEMPLATE
+        };
 
         for (int i = 0; i < xslfRelations.length; i++) {
             XSLFRelation xslfRelation = xslfRelations[i];
@@ -287,7 +295,7 @@ public class OOXMLExtractorFactory {
                 if (eventBased) {
                     return new XSLFEventBasedPowerPointExtractor(pkg);
                 } else {
-                    return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+                    return new XSLFExtractor(new XMLSlideShow(pkg));
                 }
             }
         }
@@ -296,7 +304,7 @@ public class OOXMLExtractorFactory {
             if (eventBased) {
                 return new XSLFEventBasedPowerPointExtractor(pkg);
             } else {
-                return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+                return new XSLFExtractor(new XMLSlideShow(pkg));
             }
         }
         return null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index ac6e278..5350f30 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -33,7 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -95,7 +95,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
     /**
-     * @see XSLFPowerPointExtractor#getText()
+     * @see XSLFExtractor#getText()
      */
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index c63fcb3..9b61d68 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -30,13 +30,11 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.sl.usermodel.Placeholder;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFComment;
 import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
-import org.apache.poi.xslf.usermodel.XSLFComments;
 import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
 import org.apache.poi.xslf.usermodel.XSLFGroupShape;
 import org.apache.poi.xslf.usermodel.XSLFHyperlink;
@@ -73,23 +71,13 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
     private Metadata metadata;
 
-    public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFPowerPointExtractor extractor) {
+    public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFExtractor extractor) {
         super(context, extractor);
         this.metadata = metadata;
     }
 
     /**
-     * use {@link XSLFPowerPointExtractorDecorator#XSLFPowerPointExtractorDecorator(Metadata, ParseContext, XSLFPowerPointExtractor)}
-     * @param context
-     * @param extractor
-     */
-    @Deprecated
-    public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
-        this(new Metadata(),context, extractor);
-    }
-
-    /**
-     * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+     * @see org.apache.poi.xslf.extractor.XSLFExtractor#getText()
      */
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
         XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
index 0212920..a590d39 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -25,6 +25,7 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.xmlbeans.XmlException;
 
+import java.io.Closeable;
 import java.io.IOException;
 
 /**
@@ -32,13 +33,12 @@ import java.io.IOException;
  * and keep the general framework similar to our other POI-integrated
  * extractors.
  */
-public class XPSTextExtractor extends POIXMLTextExtractor {
+public class XPSTextExtractor implements POIXMLTextExtractor {
 
     private final OPCPackage pkg;
     private final POIXMLProperties properties;
 
     public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
-        super((POIXMLDocument)null);
         this.pkg = pkg;
         this.properties = new POIXMLProperties(pkg);
 
@@ -53,6 +53,22 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
     public String getText() {
         return null;
     }
+
+    @Override
+    public void setCloseFilesystem(boolean b) {
+
+    }
+
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
+
     public POIXMLProperties.CoreProperties getCoreProperties() {
         return this.properties.getCoreProperties();
     }
@@ -64,4 +80,9 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
     public POIXMLProperties.CustomProperties getCustomProperties() {
         return this.properties.getCustomProperties();
     }
+
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index bd5615d..4f666a7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -17,6 +17,7 @@
 
 package org.apache.tika.parser.microsoft.ooxml.xslf;
 
+import java.io.Closeable;
 import java.io.IOException;
 import java.util.Date;
 
@@ -31,7 +32,7 @@ import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.xmlbeans.XmlException;
 
-public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
 
 
     private OPCPackage container;
@@ -42,7 +43,6 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
     }
 
     public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
-        super((POIXMLDocument) null);
         this.container = container;
         this.properties = new POIXMLProperties(container);
     }
@@ -76,6 +76,11 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         return this.properties.getCustomProperties();
     }
 
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
+
 
     @Override
     public String getText() {
@@ -83,6 +88,20 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         return "";
     }
 
+    @Override
+    public void setCloseFilesystem(boolean b) {
+
+    }
+
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
 
 
     private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 866bb78..d2ef6db 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -18,6 +18,7 @@
 package org.apache.tika.parser.microsoft.ooxml.xwpf;
 
 import javax.xml.parsers.ParserConfigurationException;
+import java.io.Closeable;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Date;
@@ -54,7 +55,7 @@ import org.xml.sax.XMLReader;
  * Experimental class that is based on POI's XSSFEventBasedExcelExtractor
  *
  */
-public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
+public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
 
     private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class);
 
@@ -66,7 +67,6 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
     }
 
     public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
-        super((POIXMLDocument) null);
         this.container = container;
         this.properties = new POIXMLProperties(container);
     }
@@ -100,6 +100,11 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         return this.properties.getCustomProperties();
     }
 
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
+
 
     @Override
     public String getText() {
@@ -139,6 +144,21 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         return sb.toString();
     }
 
+    @Override
+    public void setCloseFilesystem(boolean b) {
+
+    }
+
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
+
 
     private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException {
         //load the numbering/list manager and styles from the main document part