You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/03/27 04:22:24 UTC
tika git commit: TIKA-1910 - Make Web and Package optional in Office.
Repository: tika
Updated Branches:
refs/heads/2.x a38c4271e -> 84bf06285
TIKA-1910 - Make Web and Package optional in Office.
Remove POI from package parser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/84bf0628
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/84bf0628
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/84bf0628
Branch: refs/heads/2.x
Commit: 84bf06285451be911a7c711856634a69733a47a1
Parents: a38c427
Author: Bob Paulin <bo...@apache.org>
Authored: Sat Mar 26 21:22:15 2016 -0500
Committer: Bob Paulin <bo...@apache.org>
Committed: Sat Mar 26 21:22:15 2016 -0500
----------------------------------------------------------------------
.../module/BundleIT.java | 89 -----------
.../tika-parser-advanced-bundle/pom.xml | 1 -
.../tika-parser-office-bundle/pom.xml | 3 +
.../tika-parser-package-bundle/pom.xml | 58 +------
.../tika-parser-database-module/pom.xml | 6 +
.../tika-parser-office-module/pom.xml | 7 +
.../org/apache/tika/parser/chm/ChmParser.java | 11 +-
.../microsoft/AbstractPOIFSExtractor.java | 7 +-
.../parser/microsoft/JackcessExtractor.java | 8 +-
.../tika/parser/microsoft/OutlookExtractor.java | 24 +--
.../microsoft/ooxml/OOXMLExtractorFactory.java | 4 +-
.../org/apache/tika/parser/opc/OPCDetector.java | 155 +++++++++++++++++++
.../tika-parser-package-module/pom.xml | 23 +--
.../tika/parser/pkg/ZipContainerDetector.java | 113 ++------------
14 files changed, 225 insertions(+), 284 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java b/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java
deleted file mode 100644
index c446ee8..0000000
--- a/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
-import static org.ops4j.pax.exam.CoreOptions.bundle;
-import static org.ops4j.pax.exam.CoreOptions.junitBundles;
-import static org.ops4j.pax.exam.CoreOptions.options;
-import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
-
-import javax.inject.Inject;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.net.URISyntaxException;
-import java.util.Dictionary;
-
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.osgi.TikaService;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.ops4j.pax.exam.Configuration;
-import org.ops4j.pax.exam.Option;
-import org.ops4j.pax.exam.junit.PaxExam;
-import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
-import org.ops4j.pax.exam.spi.reactors.PerMethod;
-import org.osgi.framework.Bundle;
-import org.osgi.framework.BundleContext;
-import org.xml.sax.ContentHandler;
-
-@RunWith(PaxExam.class)
-@ExamReactorStrategy(PerMethod.class)
-public class BundleIT {
-
- private static final String BUNDLE_JAR_SYS_PROP = "project.bundle.file";
-
- @Inject
- private BundleContext bc;
-
- @Configuration
- public Option[] configuration() throws IOException, URISyntaxException {
- String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP);
-
- return options(junitBundles(),
- bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()),
- bundle(new File(bundleFileName).toURI().toString()));
- }
-
- @Test
- public void testBundleLoaded() throws Exception {
- boolean hasCore = false, hasBundle = false;
- for (Bundle b : bc.getBundles()) {
- if ("org.apache.tika.core".equals(b.getSymbolicName())) {
- hasCore = true;
- assertEquals("Core not activated", Bundle.ACTIVE, b.getState());
- }
- if ("org.apache.tika.parser-advanced-module".equals(b.getSymbolicName())) {
- hasBundle = true;
- assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState());
- }
- }
- assertTrue("Core bundle not found", hasCore);
- assertTrue("Advanced bundle not found", hasBundle);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
index 2339483..28713fa 100644
--- a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
@@ -63,7 +63,6 @@
*,
opennlp.maxent;resolution:=optional,
opennlp.tools.namefind;resolution:=optional,
- org.apache.commons.io;resolution:=optional,
org.json;resolution:=optional,
org.osgi.framework;resolution:=optional,
net.didion.jwnl;resolution:=optional
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-office-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
index cd6ef7f..c9db0da 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
@@ -82,6 +82,7 @@
org.apache.tika.parser.mbox.*,
org.apache.tika.parser.microsoft.*,
org.apache.tika.parser.microsoft.ooxml.*,
+ org.apache.tika.parser.opc.*,
org.apache.tika.parser.odf.*,
org.apache.tika.parser.opendocument.*,
org.apache.tika.parser.rtf.*
@@ -122,6 +123,8 @@
org.etsi.uri.x01903.v14;resolution:=optional,
org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
+ org.apache.tika.parser.html.HtmlParser;resolution:=optional,
+ org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional
</Import-Package>
</instructions>
</configuration>
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-package-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-package-bundle/pom.xml b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
index bbd917f..4d292d7 100644
--- a/tika-parser-bundles/tika-parser-package-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
@@ -50,13 +50,8 @@
commons-io;inline=true,
commons-codec;inline=true,
xz;inline=true,
- poi;inline=true,
- poi-ooxml;inline=true,
- poi-ooxml-schemas;inline=true,
- xmlbeans;inline=true,
commons-compress;inline=true,
- junrar;inline=true,
- curvesapi;inline=true
+ junrar;inline=true
</Embed-Dependency>
<Embed-Transitive>true</Embed-Transitive>
<Export-Package>
@@ -64,60 +59,11 @@
org.apache.tika.parser.iwork.*
</Export-Package>
<Import-Package>
- !org.junit,
- !org.junit.*,
- !junit.*,
*,
- com.microsoft.schemas.office.powerpoint;resolution:=optional,
- com.microsoft.schemas.office.word;resolution:=optional,
org.apache.commons.vfs2;resolution:=optional,
org.apache.commons.vfs2.provider;resolution:=optional,
org.apache.commons.vfs2.util;resolution:=optional,
- org.apache.crimson.jaxp;resolution:=optional,
- org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
- org.apache.xml.resolver;resolution:=optional,
- org.apache.xml.resolver.tools;resolution:=optional,
- org.apache.xml.security;resolution:=optional,
- org.apache.xml.security.c14n;resolution:=optional,
- org.apache.xml.security.utils;resolution:=optional,
- org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
- org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
- com.sun.javadoc;resolution:=optional,
- com.sun.xml.bind.marshaller;resolution:=optional,
- com.sun.xml.internal.bind.marshaller;resolution:=optional,
- com.sun.msv.datatype;resolution:=optional,
- com.sun.msv.datatype.xsd;resolution:=optional,
- com.sun.tools.javadoc;resolution:=optional,
- org.apache.poi.hdgf.extractor;resolution:=optional,
- org.apache.poi.hpbf.extractor;resolution:=optional,
- org.apache.poi.hslf.blip;resolution:=optional,
- org.apache.poi.hslf.extractor;resolution:=optional,
- org.apache.poi.hsmf;resolution:=optional,
- org.apache.poi.hsmf.datatypes;resolution:=optional,
- org.apache.poi.hsmf.extractor;resolution:=optional,
- org.apache.poi.hwpf;resolution:=optional,
- org.apache.poi.hwpf.extractor;resolution:=optional,
- org.apache.tools.ant;resolution:=optional,
- org.apache.tools.ant.taskdefs;resolution:=optional,
- org.apache.tools.ant.types;resolution:=optional,
- org.bouncycastle.asn1;resolution:=optional,
- org.bouncycastle.asn1.cmp;resolution:=optional,
- org.bouncycastle.asn1.nist;resolution:=optional,
- org.bouncycastle.asn1.ocsp;resolution:=optional,
- org.bouncycastle.asn1.x500;resolution:=optional,
- org.bouncycastle.asn1.x509;resolution:=optional,
- org.bouncycastle.cert;resolution:=optional,
- org.bouncycastle.cert.jcajce;resolution:=optional,
- org.bouncycastle.cert.ocsp;resolution:=optional,
- org.bouncycastle.cms;resolution:=optional,
- org.bouncycastle.cms.bc;resolution:=optional,
- org.bouncycastle.operator;resolution:=optional,
- org.bouncycastle.operator.bc;resolution:=optional,
- org.bouncycastle.tsp;resolution:=optional,
- org.bouncycastle.util;resolution:=optional,
- org.etsi.uri.x01903.v14;resolution:=optional,
- org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
- org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
+
</Import-Package>
</instructions>
</configuration>
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-database-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/pom.xml b/tika-parser-modules/tika-parser-database-module/pom.xml
index fd47f3d..a60dae3 100644
--- a/tika-parser-modules/tika-parser-database-module/pom.xml
+++ b/tika-parser-modules/tika-parser-database-module/pom.xml
@@ -47,6 +47,12 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/pom.xml b/tika-parser-modules/tika-parser-office-module/pom.xml
index 3a8e5d2..689c133 100644
--- a/tika-parser-modules/tika-parser-office-module/pom.xml
+++ b/tika-parser-modules/tika-parser-office-module/pom.xml
@@ -30,6 +30,11 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
@@ -73,11 +78,13 @@
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-package-module</artifactId>
<version>${project.version}</version>
+ <scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-web-module</artifactId>
<version>${project.version}</version>
+ <scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
index 7c43995..c3e85c1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
@@ -29,9 +29,10 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -49,6 +50,11 @@ public class ChmParser extends AbstractParser {
MediaType.application("chm"),
MediaType.application("x-chm"))));
+ private final Parser htmlProxy;
+
+ public ChmParser() {
+ this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
+ }
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -91,12 +97,11 @@ public class ChmParser extends AbstractParser {
private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException
InputStream stream = null;
Metadata metadata = new Metadata();
- HtmlParser htmlParser = new HtmlParser();
ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
ParseContext parser = new ParseContext();
try {
stream = new ByteArrayInputStream(byteObject);
- htmlParser.parse(stream, handler, metadata, parser);
+ htmlProxy.parse(stream, handler, metadata, parser);
} catch (SAXException e) {
throw new RuntimeException(e);
} catch (IOException e) {
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index e2acb52..320cc74 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -31,6 +31,7 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.hpsf.ClassID;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.DetectorProxy;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
@@ -43,7 +44,6 @@ import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
-import org.apache.tika.parser.pkg.ZipContainerDetector;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -55,6 +55,7 @@ abstract class AbstractPOIFSExtractor {
private MimeTypes mimeTypes;
private Detector detector;
private Metadata metadata;
+ private final Detector zipDetectorProxy;
protected AbstractPOIFSExtractor(ParseContext context) {
this(context, null);
@@ -74,6 +75,7 @@ abstract class AbstractPOIFSExtractor {
this.mimeTypes = context.get(MimeTypes.class);
this.detector = context.get(Detector.class);
this.metadata = metadata;
+ this.zipDetectorProxy = new DetectorProxy("org.apache.tika.parser.pkg.ZipContainerDetector", getClass().getClassLoader());
}
// Note - these cache, but avoid creating the default TikaConfig if not needed
@@ -159,8 +161,7 @@ abstract class AbstractPOIFSExtractor {
try (TikaInputStream stream = TikaInputStream.get(
new DocumentInputStream((DocumentEntry) ooxml))) {
- ZipContainerDetector detector = new ZipContainerDetector();
- MediaType type = detector.detect(stream, new Metadata());
+ MediaType type = zipDetectorProxy.detect(stream, new Metadata());
handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
return;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index e224d54..345dd24 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -47,7 +47,8 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -71,10 +72,11 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
final NumberFormat currencyFormatter;
final DateFormat shortDateTimeFormatter;
- final HtmlParser htmlParser = new HtmlParser();
+ private final Parser htmlParserProxy;
protected JackcessExtractor(ParseContext context, Locale locale) {
super(context);
+ this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
currencyFormatter = NumberFormat.getCurrencyInstance(locale);
shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
}
@@ -200,7 +202,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
Metadata m = new Metadata();
m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
try {
- htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
+ htmlParserProxy.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
h,
m, EMPTY_PARSE_CONTEXT);
handler.characters(h.toString());
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 3a85882..108d5eb 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -23,6 +23,7 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.text.ParseException;
+import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
@@ -44,14 +45,15 @@ import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.util.CodePageUtil;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.EncodingDetectorProxy;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.html.HtmlEncodingDetector;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.parser.mbox.MboxParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
@@ -67,7 +69,9 @@ import static java.nio.charset.StandardCharsets.UTF_8;
*/
public class OutlookExtractor extends AbstractPOIFSExtractor {
private static final Metadata EMPTY_METADATA = new Metadata();
- HtmlEncodingDetector detector = new HtmlEncodingDetector();
+ private final SimpleDateFormat dateFormat;
+ private final EncodingDetector htmlEncodingDetectorProxy;
+ private final Parser htmlParserProxy;
private final MAPIMessage msg;
@@ -77,7 +81,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
super(context);
-
+ this.htmlEncodingDetectorProxy = new EncodingDetectorProxy("org.apache.tika.parser.html.HtmlEncodingDetector", getClass().getClassLoader());
+ this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
+ this.dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
try {
this.msg = new MAPIMessage(root);
} catch (IOException e) {
@@ -135,7 +141,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
// See if we can parse it as a normal mail date
try {
- Date d = MboxParser.parseDate(date);
+
+ Date d = dateFormat.parse(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
} catch (ParseException e) {
@@ -196,8 +203,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
data = ((StringChunk) htmlChunk).getRawValue();
}
if (data != null) {
- HtmlParser htmlParser = new HtmlParser();
- htmlParser.parse(
+ htmlParserProxy.parse(
new ByteArrayInputStream(data),
new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
new Metadata(), new ParseContext()
@@ -341,7 +347,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
if(html != null && html.length() > 0) {
Charset charset = null;
try {
- charset = detector.detect(new ByteArrayInputStream(
+ charset = htmlEncodingDetectorProxy.detect(new ByteArrayInputStream(
html.getBytes(UTF_8)), EMPTY_METADATA);
} catch (IOException e) {
//swallow
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index add1f2c..518a000 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -39,7 +39,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.parser.opc.OPCDetector;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -74,7 +74,7 @@ public class OOXMLExtractorFactory {
}
// Get the type, and ensure it's one we handle
- MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+ MediaType type = OPCDetector.detectOfficeOpenXML(pkg);
if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
// Not a supported type, delegate to Empty Parser
EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
new file mode 100644
index 0000000..cc17459
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector that detects OPC Packages
+ *
+ */
+public class OPCDetector implements Detector {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -3569622763024617244L;
+
+ private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+ private static final String VISIO_DOCUMENT =
+ "http://schemas.microsoft.com/visio/2010/relationships/document";
+
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+ private static final String STRICT_CORE_DOCUMENT =
+ "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream stream = TikaInputStream.get(input, tmp);
+ // Use POI to open and investigate it for us
+ OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
+ stream.setOpenContainer(pkg);
+
+ // Is it an OOXML format?
+ MediaType type = detectOfficeOpenXML(pkg);
+ if (type != null) return type;
+
+ // Is it XPS format?
+ type = detectXPSOPC(pkg);
+ if (type != null) return type;
+
+ // Is it an AutoCAD format?
+ type = detectAutoCADOPC(pkg);
+
+ return type;
+ } catch (InvalidFormatException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }finally {
+ tmp.close();
+ }
+ return null;
+ }
+
+ /**
+ * Detects the type of an OfficeOpenXML (OOXML) file from
+ * opened Package
+ */
+ public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+ // Check for the normal Office core document
+ PackageRelationshipCollection core =
+ pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+ // Otherwise check for some other Office core document types
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+ }
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
+ }
+
+ // If we didn't find a single core document of any type, skip detection
+ if (core.size() != 1) {
+ // Invalid OOXML Package received
+ return null;
+ }
+
+ // Get the type of the core document part
+ PackagePart corePart = pkg.getPart(core.getRelationship(0));
+ String coreType = corePart.getContentType();
+
+ // Turn that into the type of the overall document
+ String docType = coreType.substring(0, coreType.lastIndexOf('.'));
+
+ // The Macro Enabled formats are a little special
+ if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+ docType = docType.toLowerCase(Locale.ROOT) + ".12";
+ }
+
+ if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
+ docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
+ }
+
+ // Build the MediaType object and return
+ return MediaType.parse(docType);
+ }
+ /**
+ * Detects Open XML Paper Specification (XPS)
+ */
+ private static MediaType detectXPSOPC(OPCPackage pkg) {
+ PackageRelationshipCollection xps =
+ pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
+ if (xps.size() == 1) {
+ return MediaType.application("vnd.ms-xpsdocument");
+ } else {
+ // Non-XPS Package received
+ return null;
+ }
+ }
+ /**
+ * Detects AutoCAD formats that live in OPC packaging
+ */
+ private static MediaType detectAutoCADOPC(OPCPackage pkg) {
+ PackageRelationshipCollection dwfxSeq =
+ pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+ if (dwfxSeq.size() == 1) {
+ return MediaType.parse("model/vnd.dwfx+xps");
+ } else {
+ // Non-AutoCAD Package received
+ return null;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-package-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/pom.xml b/tika-parser-modules/tika-parser-package-module/pom.xml
index 2e226d2..8d1238d 100644
--- a/tika-parser-modules/tika-parser-package-module/pom.xml
+++ b/tika-parser-modules/tika-parser-package-module/pom.xml
@@ -35,21 +35,6 @@
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>${poi.version}</version>
- <exclusions>
- <exclusion>
- <groupId>stax</groupId>
- <artifactId>stax-api</artifactId>
- </exclusion>
- <exclusion>
- <groupId>xml-apis</groupId>
- <artifactId>xml-apis</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
<version>${tukaani.version}</version>
@@ -69,13 +54,17 @@
<artifactId>commons-compress</artifactId>
<version>${commons.compress.version}</version>
</dependency>
- <dependency>
+ <dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-text-module</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
-
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 9ca6729..8276e9a 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -22,7 +22,6 @@ import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
-import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
@@ -36,12 +35,7 @@ import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.AbstractDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
@@ -57,18 +51,16 @@ import static java.nio.charset.StandardCharsets.UTF_8;
* A detector that works on Zip documents and other archive and compression
* formats to figure out exactly what the file is.
*/
-public class ZipContainerDetector implements Detector {
- private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+public class ZipContainerDetector extends AbstractDetector {
- // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
- private static final String VISIO_DOCUMENT =
- "http://schemas.microsoft.com/visio/2010/relationships/document";
- // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
- private static final String STRICT_CORE_DOCUMENT =
- "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
-
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
+
+ private final Detector opcDetector;
+ // OPC detector is named by class-name string, not imported: poi-ooxml was dropped from this module's pom.
+ public ZipContainerDetector() {
+ this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector");
+ }
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
@@ -138,7 +130,7 @@ public class ZipContainerDetector implements Detector {
}
}
- private static MediaType detectZipFormat(TikaInputStream tis) {
+ private MediaType detectZipFormat(TikaInputStream tis) {
try {
ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
try {
@@ -199,24 +191,11 @@ public class ZipContainerDetector implements Detector {
}
}
- private static MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+ private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
try {
if (zip.getEntry("_rels/.rels") != null
|| zip.getEntry("[Content_Types].xml") != null) {
- // Use POI to open and investigate it for us
- OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
- stream.setOpenContainer(pkg);
-
- // Is at an OOXML format?
- MediaType type = detectOfficeOpenXML(pkg);
- if (type != null) return type;
-
- // Is it XPS format?
- type = detectXPSOPC(pkg);
- if (type != null) return type;
-
- // Is it an AutoCAD format?
- type = detectAutoCADOPC(pkg);
+ MediaType type = this.opcDetector.detect(stream, null);
if (type != null) return type;
// We don't know what it is, sorry
@@ -228,77 +207,9 @@ public class ZipContainerDetector implements Detector {
return null;
} catch (RuntimeException e) {
return null;
- } catch (InvalidFormatException e) {
- return null;
- }
- }
- /**
- * Detects the type of an OfficeOpenXML (OOXML) file from
- * opened Package
- */
- public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
- // Check for the normal Office core document
- PackageRelationshipCollection core =
- pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
- // Otherwise check for some other Office core document types
- if (core.size() == 0) {
- core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
- }
- if (core.size() == 0) {
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
- }
-
- // If we didn't find a single core document of any type, skip detection
- if (core.size() != 1) {
- // Invalid OOXML Package received
- return null;
- }
-
- // Get the type of the core document part
- PackagePart corePart = pkg.getPart(core.getRelationship(0));
- String coreType = corePart.getContentType();
-
- // Turn that into the type of the overall document
- String docType = coreType.substring(0, coreType.lastIndexOf('.'));
-
- // The Macro Enabled formats are a little special
- if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
- docType = docType.toLowerCase(Locale.ROOT) + ".12";
- }
-
- if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
- docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
- }
-
- // Build the MediaType object and return
- return MediaType.parse(docType);
- }
- /**
- * Detects Open XML Paper Specification (XPS)
- */
- private static MediaType detectXPSOPC(OPCPackage pkg) {
- PackageRelationshipCollection xps =
- pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
- if (xps.size() == 1) {
- return MediaType.application("vnd.ms-xpsdocument");
- } else {
- // Non-XPS Package received
- return null;
- }
- }
- /**
- * Detects AutoCAD formats that live in OPC packaging
- */
- private static MediaType detectAutoCADOPC(OPCPackage pkg) {
- PackageRelationshipCollection dwfxSeq =
- pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
- if (dwfxSeq.size() == 1) {
- return MediaType.parse("model/vnd.dwfx+xps");
- } else {
- // Non-AutoCAD Package received
- return null;
}
}
+
private static MediaType detectIWork(ZipFile zip) {
if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {