You are viewing a plain text version of this content. The canonical link for it is not included in this plain text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/29 14:10:10 UTC

[tika] 03/03: TIKA-3164 -- further tweaks

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit bc9cca8761d20bd9a9f2f54ebcaf89ba093e2c82
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 29 10:09:41 2021 -0400

    TIKA-3164 -- further tweaks
---
 tika-bundle/pom.xml                                | 23 ++++++++++++++++++-
 tika-eval/pom.xml                                  | 26 +++++++++++++++++++++-
 tika-parsers/pom.xml                               | 20 +++++++++++++++++
 .../tika/parser/microsoft/SummaryExtractor.java    |  1 -
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  1 -
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  9 +++-----
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  2 --
 tika-parsers/src/test/resources/log4j.properties   |  2 +-
 .../apache/tika/server/resource/TikaResource.java  |  2 +-
 9 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index c90cc85..aecfb25 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -203,7 +203,8 @@
               bcpkix-jdk15on|
               poi|poi-scratchpad|
               poi-ooxml|
-              poi-ooxml-schemas|
+              poi-ooxml-lite|
+              log4j-api|
               commons-math3|
               curvesapi|
               xmlbeans|
@@ -266,6 +267,7 @@
 	      !org.apache.spark.ml.*,
 	      !org.apache.spark.mllib.*,
 	      !org.apache.spark.sql.*,
+              !com.github.javaparser.*,
 	          org.apache.tika.mime,
               org.apache.tika.fork,
               android.util;resolution:=optional,
@@ -287,11 +289,14 @@
               com.parso;resolution:=optional,
               com.sleepycat.je;resolution:=optional,
               com.sun.javadoc;resolution:=optional,
+              com.sun.org.apache.xml.internal.resolver;resolution:=optional,
+              com.sun.org.apache.xml.internal.resolver.tools;resolution:=optional,
               com.sun.xml.bind.marshaller;resolution:=optional,
               com.sun.xml.internal.bind.marshaller;resolution:=optional,
               com.sun.msv.datatype;resolution:=optional,
               com.sun.msv.datatype.xsd;resolution:=optional,
               com.sun.tools.javadoc;resolution:=optional,
+              de.rototor.pdfbox.graphics2d;resolution:=optional,
               edu.mit.ll.mitie;resolution:=optional,
               edu.stanford.nlp.*;resolution:=optional,
               edu.wisc.ssec.mcidas;resolution:=optional,
@@ -324,15 +329,22 @@
               net.didion.jwnl;resolution:=optional,
               net.sf.saxon;resolution:=optional,
               net.sf.saxon.dom;resolution:=optional,
+              net.sf.saxon.lib;resolution:=optional,
+              net.sf.saxon.ma.map;resolution:=optional,
               net.sf.saxon.om;resolution:=optional,
               net.sf.saxon.query;resolution:=optional,
               net.sf.saxon.sxpath;resolution:=optional,
+              net.sf.saxon.trans;resolution:=optional,
+              net.sf.saxon.tree.wrapper;resolution:=optional,
+              net.sf.saxon.type;resolution:=optional,
               net.sf.saxon.value;resolution:=optional,
               org.apache.batik.anim.dom;resolution:=optional,
               org.apache.batik.bridge;resolution:=optional,
+              org.apache.batik.dom;resolution:=optional,
               org.apache.batik.ext.awt;resolution:=optional,
               org.apache.batik.ext.awt.image.renderable;resolution:=optional,
               org.apache.batik.gvt;resolution:=optional,
+              org.apache.batik.svggen;resolution:=optional,
               org.apache.batik.util;resolution:=optional,
               org.apache.cxf.jaxrs.client;resolution:=optional,
               org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
@@ -349,6 +361,14 @@
               org.apache.commons.vfs2.util;resolution:=optional,
               org.apache.crimson.jaxp;resolution:=optional,
               org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+              org.apache.logging.log4j;resolution:=optional,
+              org.apache.logging.log4j.message;resolution:=optional,
+              org.apache.logging.log4j.util;resolution:=optional,
+              org.apache.logging.log4j.util.internal;resolution:=optional,
+              org.apache.maven.model;resolution:=optional,
+              org.apache.maven.plugin;resolution:=optional,
+              org.apache.maven.plugin.logging;resolution:=optional,
+              org.apache.maven.project;resolution:=optional,
               org.apache.pdfbox.debugger;resolution:=optional,
               org.apache.pdfbox.preflight.*;resolution:=optional,
               org.apache.sis;resolution:=optional,
@@ -438,6 +458,7 @@
               org.slf4j.helpers;resolution:=optional,
               org.sqlite;resolution:=optional,
               org.w3c.dom;resolution:=optional,
+              org.w3c.dom.traversal;resolution:=optional,
               org.relaxng.datatype;resolution:=optional,
               org.xml.sax;resolution:=optional,
               org.xml.sax.ext;resolution:=optional,
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index d86c365..08359bf 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,11 +124,35 @@
                     <groupId>org.apache.commons</groupId>
                     <artifactId>commons-compress</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-all</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-bridge</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-svggen</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-codec</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.pdfbox</groupId>
+                    <artifactId>pdfbox</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>jakarta.xml.bind</groupId>
+                    <artifactId>jakarta.xml.bind-api</artifactId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
             <groupId>org.apache.poi</groupId>
-            <artifactId>poi-ooxml-schemas</artifactId>
+            <artifactId>poi-ooxml-lite</artifactId>
             <version>${poi.version}</version>
         </dependency>
         <dependency>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index cb0e473..693515e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -275,6 +275,26 @@
           <groupId>org.apache.xmlgraphics</groupId>
           <artifactId>batik-all</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-bridge</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-svggen</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-codec</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.pdfbox</groupId>
+          <artifactId>pdfbox</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>jakarta.xml.bind</groupId>
+          <artifactId>jakarta.xml.bind-api</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index ba98c0e..30c472d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -24,7 +24,6 @@ import java.util.Set;
 
 import org.apache.poi.hpsf.CustomProperties;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
 import org.apache.poi.hpsf.NoPropertySetStreamException;
 import org.apache.poi.hpsf.PropertySet;
 import org.apache.poi.hpsf.SummaryInformation;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index b404da7..ba43c31 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -380,7 +380,6 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             throws SAXException, IOException {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
-
         // Get the name
         String name = part.getPartName().getName();
         metadata.set(
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index ddc607a..fd265f5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -65,8 +65,6 @@ import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors;
-
 /**
  * Figures out the correct {@link OOXMLExtractor} for the supplied document and
  * returns it.
@@ -78,7 +76,7 @@ public class OOXMLExtractorFactory {
     private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
 
     static {
-        setThreadPrefersEventExtractors(true);
+        POIXMLExtractorFactory.setAllThreadsPreferEventExtractors(true);
     }
 
     public static void parse(
@@ -176,7 +174,6 @@ public class OOXMLExtractorFactory {
             if (poiExtractor == null) {
                 poiExtractor = EXTRACTOR_FACTORY.create(pkg);
             }
-
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
                 extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
@@ -212,7 +209,6 @@ public class OOXMLExtractorFactory {
             // Get the bulk of the metadata first, so that it's accessible during
             //  parsing if desired by the client (see TIKA-1109)
             extractor.getMetadataExtractor().extract(metadata);
-
             // Extract the text, along with any in-document metadata
             extractor.getXHTML(baseHandler, metadata, context);
         } catch (IllegalArgumentException e) {
@@ -291,7 +287,8 @@ public class OOXMLExtractorFactory {
         //TODO make this static...or find what happened to SUPPORTED_TYPES
         XSLFRelation[] xslfRelations = new XSLFRelation[] {
                 XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
-                XSLFRelation.PRESENTATIONML_TEMPLATE
+                XSLFRelation.PRESENTATIONML,
+                XSLFRelation.PRESENTATIONML_TEMPLATE, XSLFRelation.PRESENTATION_MACRO
         };
 
         for (int i = 0; i < xslfRelations.length; i++) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bdbc9e4..3e007b9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,7 +31,6 @@ import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
-import java.nio.file.Path;
 import java.text.DecimalFormatSymbols;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -43,7 +42,6 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.ctakes.typesystem.type.syntax.O;
 import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
diff --git a/tika-parsers/src/test/resources/log4j.properties b/tika-parsers/src/test/resources/log4j.properties
index f2c0b92..d557c48 100644
--- a/tika-parsers/src/test/resources/log4j.properties
+++ b/tika-parsers/src/test/resources/log4j.properties
@@ -21,4 +21,4 @@ log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 
 # Pattern to output the caller's file name and line number.
-log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
+log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) ----- %m%n
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b326f7f..c77694d 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -22,7 +22,6 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.cxf.attachment.ContentDisposition;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.impl.MetadataMap;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
@@ -55,6 +54,7 @@ import org.apache.tika.server.ServerStatus;
 import org.apache.tika.server.TikaServerParseException;
 import org.apache.tika.utils.ExceptionUtils;
 
+import org.apache.poi.extractor.ExtractorFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;