You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/29 14:10:10 UTC
[tika] 03/03: TIKA-3164 -- further tweaks
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit bc9cca8761d20bd9a9f2f54ebcaf89ba093e2c82
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 29 10:09:41 2021 -0400
TIKA-3164 -- further tweaks
---
tika-bundle/pom.xml | 23 ++++++++++++++++++-
tika-eval/pom.xml | 26 +++++++++++++++++++++-
tika-parsers/pom.xml | 20 +++++++++++++++++
.../tika/parser/microsoft/SummaryExtractor.java | 1 -
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 1 -
.../microsoft/ooxml/OOXMLExtractorFactory.java | 9 +++-----
.../parser/microsoft/ooxml/OOXMLParserTest.java | 2 --
tika-parsers/src/test/resources/log4j.properties | 2 +-
.../apache/tika/server/resource/TikaResource.java | 2 +-
9 files changed, 72 insertions(+), 14 deletions(-)
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index c90cc85..aecfb25 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -203,7 +203,8 @@
bcpkix-jdk15on|
poi|poi-scratchpad|
poi-ooxml|
- poi-ooxml-schemas|
+ poi-ooxml-lite|
+ log4j-api|
commons-math3|
curvesapi|
xmlbeans|
@@ -266,6 +267,7 @@
!org.apache.spark.ml.*,
!org.apache.spark.mllib.*,
!org.apache.spark.sql.*,
+ !com.github.javaparser.*,
org.apache.tika.mime,
org.apache.tika.fork,
android.util;resolution:=optional,
@@ -287,11 +289,14 @@
com.parso;resolution:=optional,
com.sleepycat.je;resolution:=optional,
com.sun.javadoc;resolution:=optional,
+ com.sun.org.apache.xml.internal.resolver;resolution:=optional,
+ com.sun.org.apache.xml.internal.resolver.tools;resolution:=optional,
com.sun.xml.bind.marshaller;resolution:=optional,
com.sun.xml.internal.bind.marshaller;resolution:=optional,
com.sun.msv.datatype;resolution:=optional,
com.sun.msv.datatype.xsd;resolution:=optional,
com.sun.tools.javadoc;resolution:=optional,
+ de.rototor.pdfbox.graphics2d;resolution:=optional,
edu.mit.ll.mitie;resolution:=optional,
edu.stanford.nlp.*;resolution:=optional,
edu.wisc.ssec.mcidas;resolution:=optional,
@@ -324,15 +329,22 @@
net.didion.jwnl;resolution:=optional,
net.sf.saxon;resolution:=optional,
net.sf.saxon.dom;resolution:=optional,
+ net.sf.saxon.lib;resolution:=optional,
+ net.sf.saxon.ma.map;resolution:=optional,
net.sf.saxon.om;resolution:=optional,
net.sf.saxon.query;resolution:=optional,
net.sf.saxon.sxpath;resolution:=optional,
+ net.sf.saxon.trans;resolution:=optional,
+ net.sf.saxon.tree.wrapper;resolution:=optional,
+ net.sf.saxon.type;resolution:=optional,
net.sf.saxon.value;resolution:=optional,
org.apache.batik.anim.dom;resolution:=optional,
org.apache.batik.bridge;resolution:=optional,
+ org.apache.batik.dom;resolution:=optional,
org.apache.batik.ext.awt;resolution:=optional,
org.apache.batik.ext.awt.image.renderable;resolution:=optional,
org.apache.batik.gvt;resolution:=optional,
+ org.apache.batik.svggen;resolution:=optional,
org.apache.batik.util;resolution:=optional,
org.apache.cxf.jaxrs.client;resolution:=optional,
org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
@@ -349,6 +361,14 @@
org.apache.commons.vfs2.util;resolution:=optional,
org.apache.crimson.jaxp;resolution:=optional,
org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+ org.apache.logging.log4j;resolution:=optional,
+ org.apache.logging.log4j.message;resolution:=optional,
+ org.apache.logging.log4j.util;resolution:=optional,
+ org.apache.logging.log4j.util.internal;resolution:=optional,
+ org.apache.maven.model;resolution:=optional,
+ org.apache.maven.plugin;resolution:=optional,
+ org.apache.maven.plugin.logging;resolution:=optional,
+ org.apache.maven.project;resolution:=optional,
org.apache.pdfbox.debugger;resolution:=optional,
org.apache.pdfbox.preflight.*;resolution:=optional,
org.apache.sis;resolution:=optional,
@@ -438,6 +458,7 @@
org.slf4j.helpers;resolution:=optional,
org.sqlite;resolution:=optional,
org.w3c.dom;resolution:=optional,
+ org.w3c.dom.traversal;resolution:=optional,
org.relaxng.datatype;resolution:=optional,
org.xml.sax;resolution:=optional,
org.xml.sax.ext;resolution:=optional,
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index d86c365..08359bf 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,11 +124,35 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-all</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-bridge</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-svggen</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-codec</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>jakarta.xml.bind</groupId>
+ <artifactId>jakarta.xml.bind-api</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml-schemas</artifactId>
+ <artifactId>poi-ooxml-lite</artifactId>
<version>${poi.version}</version>
</dependency>
<dependency>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index cb0e473..693515e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -275,6 +275,26 @@
<groupId>org.apache.xmlgraphics</groupId>
<artifactId>batik-all</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-bridge</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-svggen</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-codec</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>jakarta.xml.bind</groupId>
+ <artifactId>jakarta.xml.bind-api</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index ba98c0e..30c472d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -24,7 +24,6 @@ import java.util.Set;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.SummaryInformation;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index b404da7..ba43c31 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -380,7 +380,6 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
throws SAXException, IOException {
Metadata metadata = new Metadata();
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
-
// Get the name
String name = part.getPartName().getName();
metadata.set(
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index ddc607a..fd265f5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -65,8 +65,6 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors;
-
/**
* Figures out the correct {@link OOXMLExtractor} for the supplied document and
* returns it.
@@ -78,7 +76,7 @@ public class OOXMLExtractorFactory {
private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
static {
- setThreadPrefersEventExtractors(true);
+ POIXMLExtractorFactory.setAllThreadsPreferEventExtractors(true);
}
public static void parse(
@@ -176,7 +174,6 @@ public class OOXMLExtractorFactory {
if (poiExtractor == null) {
poiExtractor = EXTRACTOR_FACTORY.create(pkg);
}
-
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
@@ -212,7 +209,6 @@ public class OOXMLExtractorFactory {
// Get the bulk of the metadata first, so that it's accessible during
// parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
-
// Extract the text, along with any in-document metadata
extractor.getXHTML(baseHandler, metadata, context);
} catch (IllegalArgumentException e) {
@@ -291,7 +287,8 @@ public class OOXMLExtractorFactory {
//TODO make this static...or find what happened to SUPPORTED_TYPES
XSLFRelation[] xslfRelations = new XSLFRelation[] {
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
- XSLFRelation.PRESENTATIONML_TEMPLATE
+ XSLFRelation.PRESENTATIONML,
+ XSLFRelation.PRESENTATIONML_TEMPLATE, XSLFRelation.PRESENTATION_MACRO
};
for (int i = 0; i < xslfRelations.length; i++) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bdbc9e4..3e007b9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,7 +31,6 @@ import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
-import java.nio.file.Path;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.HashMap;
@@ -43,7 +42,6 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.ctakes.typesystem.type.syntax.O;
import org.apache.poi.util.LocaleUtil;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
diff --git a/tika-parsers/src/test/resources/log4j.properties b/tika-parsers/src/test/resources/log4j.properties
index f2c0b92..d557c48 100644
--- a/tika-parsers/src/test/resources/log4j.properties
+++ b/tika-parsers/src/test/resources/log4j.properties
@@ -21,4 +21,4 @@ log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
# Pattern to output the caller's file name and line number.
-log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
+log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) ----- %m%n
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b326f7f..c77694d 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -22,7 +22,6 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.cxf.attachment.ContentDisposition;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.impl.MetadataMap;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
@@ -55,6 +54,7 @@ import org.apache.tika.server.ServerStatus;
import org.apache.tika.server.TikaServerParseException;
import org.apache.tika.utils.ExceptionUtils;
+import org.apache.poi.extractor.ExtractorFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;