You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/03/27 04:22:24 UTC

tika git commit: TIKA-1910 - Make Web and Package optional in Office.

Repository: tika
Updated Branches:
  refs/heads/2.x a38c4271e -> 84bf06285


TIKA-1910 - Make Web and Package optional in Office.

Remove POI from package parser

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/84bf0628
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/84bf0628
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/84bf0628

Branch: refs/heads/2.x
Commit: 84bf06285451be911a7c711856634a69733a47a1
Parents: a38c427
Author: Bob Paulin <bo...@apache.org>
Authored: Sat Mar 26 21:22:15 2016 -0500
Committer: Bob Paulin <bo...@apache.org>
Committed: Sat Mar 26 21:22:15 2016 -0500

----------------------------------------------------------------------
 .../module/BundleIT.java                        |  89 -----------
 .../tika-parser-advanced-bundle/pom.xml         |   1 -
 .../tika-parser-office-bundle/pom.xml           |   3 +
 .../tika-parser-package-bundle/pom.xml          |  58 +------
 .../tika-parser-database-module/pom.xml         |   6 +
 .../tika-parser-office-module/pom.xml           |   7 +
 .../org/apache/tika/parser/chm/ChmParser.java   |  11 +-
 .../microsoft/AbstractPOIFSExtractor.java       |   7 +-
 .../parser/microsoft/JackcessExtractor.java     |   8 +-
 .../tika/parser/microsoft/OutlookExtractor.java |  24 +--
 .../microsoft/ooxml/OOXMLExtractorFactory.java  |   4 +-
 .../org/apache/tika/parser/opc/OPCDetector.java | 155 +++++++++++++++++++
 .../tika-parser-package-module/pom.xml          |  23 +--
 .../tika/parser/pkg/ZipContainerDetector.java   | 113 ++------------
 14 files changed, 225 insertions(+), 284 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java b/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java
deleted file mode 100644
index c446ee8..0000000
--- a/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
-import static org.ops4j.pax.exam.CoreOptions.bundle;
-import static org.ops4j.pax.exam.CoreOptions.junitBundles;
-import static org.ops4j.pax.exam.CoreOptions.options;
-import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
-
-import javax.inject.Inject;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.net.URISyntaxException;
-import java.util.Dictionary;
-
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.osgi.TikaService;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.ops4j.pax.exam.Configuration;
-import org.ops4j.pax.exam.Option;
-import org.ops4j.pax.exam.junit.PaxExam;
-import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
-import org.ops4j.pax.exam.spi.reactors.PerMethod;
-import org.osgi.framework.Bundle;
-import org.osgi.framework.BundleContext;
-import org.xml.sax.ContentHandler;
-
-@RunWith(PaxExam.class)
-@ExamReactorStrategy(PerMethod.class)
-public class BundleIT {
-
-    private static final String BUNDLE_JAR_SYS_PROP = "project.bundle.file";
-    
-    @Inject
-    private BundleContext bc;
-
-    @Configuration
-    public Option[] configuration() throws IOException, URISyntaxException {
-        String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP);
-
-        return options(junitBundles(), 
-                bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()),
-                bundle(new File(bundleFileName).toURI().toString()));
-    }
-
-    @Test
-    public void testBundleLoaded() throws Exception {
-        boolean hasCore = false, hasBundle = false;
-        for (Bundle b : bc.getBundles()) {
-            if ("org.apache.tika.core".equals(b.getSymbolicName())) {
-                hasCore = true;
-                assertEquals("Core not activated", Bundle.ACTIVE, b.getState());
-            }
-            if ("org.apache.tika.parser-advanced-module".equals(b.getSymbolicName())) {
-                hasBundle = true;
-                assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState());
-            }
-        }
-        assertTrue("Core bundle not found", hasCore);
-        assertTrue("Advanced bundle not found", hasBundle);
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
index 2339483..28713fa 100644
--- a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
@@ -63,7 +63,6 @@
               *,
               opennlp.maxent;resolution:=optional,
               opennlp.tools.namefind;resolution:=optional,
-              org.apache.commons.io;resolution:=optional,
               org.json;resolution:=optional,
               org.osgi.framework;resolution:=optional,
               net.didion.jwnl;resolution:=optional

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-office-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
index cd6ef7f..c9db0da 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
@@ -82,6 +82,7 @@
               org.apache.tika.parser.mbox.*,
               org.apache.tika.parser.microsoft.*,
               org.apache.tika.parser.microsoft.ooxml.*,
+              org.apache.tika.parser.opc.*,
               org.apache.tika.parser.odf.*,
               org.apache.tika.parser.opendocument.*,
               org.apache.tika.parser.rtf.*
@@ -122,6 +123,8 @@
               org.etsi.uri.x01903.v14;resolution:=optional,
               org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
               org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
+              org.apache.tika.parser.html.HtmlParser;resolution:=optional,
+              org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional
             </Import-Package>
           </instructions>
         </configuration>

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-package-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-package-bundle/pom.xml b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
index bbd917f..4d292d7 100644
--- a/tika-parser-bundles/tika-parser-package-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
@@ -50,13 +50,8 @@
               commons-io;inline=true,
               commons-codec;inline=true,
               xz;inline=true,
-              poi;inline=true,
-              poi-ooxml;inline=true,
-              poi-ooxml-schemas;inline=true,
-              xmlbeans;inline=true,
               commons-compress;inline=true,
-              junrar;inline=true,
-              curvesapi;inline=true
+              junrar;inline=true
             </Embed-Dependency>
             <Embed-Transitive>true</Embed-Transitive>
             <Export-Package>
@@ -64,60 +59,11 @@
               org.apache.tika.parser.iwork.*
             </Export-Package>
             <Import-Package>
-              !org.junit,
-              !org.junit.*,
-              !junit.*,
               *,
-              com.microsoft.schemas.office.powerpoint;resolution:=optional,
-              com.microsoft.schemas.office.word;resolution:=optional,
               org.apache.commons.vfs2;resolution:=optional,
               org.apache.commons.vfs2.provider;resolution:=optional,
               org.apache.commons.vfs2.util;resolution:=optional,
-              org.apache.crimson.jaxp;resolution:=optional,
-              org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
-              org.apache.xml.resolver;resolution:=optional,
-              org.apache.xml.resolver.tools;resolution:=optional,
-              org.apache.xml.security;resolution:=optional,
-              org.apache.xml.security.c14n;resolution:=optional,
-              org.apache.xml.security.utils;resolution:=optional,
-              org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
-              org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
-              com.sun.javadoc;resolution:=optional,
-              com.sun.xml.bind.marshaller;resolution:=optional,
-              com.sun.xml.internal.bind.marshaller;resolution:=optional,
-              com.sun.msv.datatype;resolution:=optional,
-              com.sun.msv.datatype.xsd;resolution:=optional,
-              com.sun.tools.javadoc;resolution:=optional,
-              org.apache.poi.hdgf.extractor;resolution:=optional,
-              org.apache.poi.hpbf.extractor;resolution:=optional,
-              org.apache.poi.hslf.blip;resolution:=optional,
-              org.apache.poi.hslf.extractor;resolution:=optional,
-              org.apache.poi.hsmf;resolution:=optional,
-              org.apache.poi.hsmf.datatypes;resolution:=optional,
-              org.apache.poi.hsmf.extractor;resolution:=optional,
-              org.apache.poi.hwpf;resolution:=optional,
-              org.apache.poi.hwpf.extractor;resolution:=optional,
-              org.apache.tools.ant;resolution:=optional,
-              org.apache.tools.ant.taskdefs;resolution:=optional,
-              org.apache.tools.ant.types;resolution:=optional,
-              org.bouncycastle.asn1;resolution:=optional,
-              org.bouncycastle.asn1.cmp;resolution:=optional,
-              org.bouncycastle.asn1.nist;resolution:=optional,
-              org.bouncycastle.asn1.ocsp;resolution:=optional,
-              org.bouncycastle.asn1.x500;resolution:=optional,
-              org.bouncycastle.asn1.x509;resolution:=optional,
-              org.bouncycastle.cert;resolution:=optional,
-              org.bouncycastle.cert.jcajce;resolution:=optional,
-              org.bouncycastle.cert.ocsp;resolution:=optional,
-              org.bouncycastle.cms;resolution:=optional,
-              org.bouncycastle.cms.bc;resolution:=optional,
-              org.bouncycastle.operator;resolution:=optional,
-              org.bouncycastle.operator.bc;resolution:=optional,
-              org.bouncycastle.tsp;resolution:=optional,
-              org.bouncycastle.util;resolution:=optional,
-              org.etsi.uri.x01903.v14;resolution:=optional,
-              org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
-              org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
+              
             </Import-Package>
           </instructions>
         </configuration>

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-database-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/pom.xml b/tika-parser-modules/tika-parser-database-module/pom.xml
index fd47f3d..a60dae3 100644
--- a/tika-parser-modules/tika-parser-database-module/pom.xml
+++ b/tika-parser-modules/tika-parser-database-module/pom.xml
@@ -47,6 +47,12 @@
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/pom.xml b/tika-parser-modules/tika-parser-office-module/pom.xml
index 3a8e5d2..689c133 100644
--- a/tika-parser-modules/tika-parser-office-module/pom.xml
+++ b/tika-parser-modules/tika-parser-office-module/pom.xml
@@ -30,6 +30,11 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi</artifactId>
       <version>${poi.version}</version>
@@ -73,11 +78,13 @@
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-package-module</artifactId>
       <version>${project.version}</version>
+      <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-web-module</artifactId>
       <version>${project.version}</version>
+      <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
index 7c43995..c3e85c1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
@@ -29,9 +29,10 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
 import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.parser.html.HtmlParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -49,6 +50,11 @@ public class ChmParser extends AbstractParser {
                     MediaType.application("chm"),
                     MediaType.application("x-chm"))));
 
+    private final Parser htmlProxy;
+    
+    public ChmParser() {
+        this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
+    }
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
@@ -91,12 +97,11 @@ public class ChmParser extends AbstractParser {
     private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException
         InputStream stream = null;
         Metadata metadata = new Metadata();
-        HtmlParser htmlParser = new HtmlParser();
         ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
         ParseContext parser = new ParseContext();
         try {
             stream = new ByteArrayInputStream(byteObject);
-            htmlParser.parse(stream, handler, metadata, parser);
+            htmlProxy.parse(stream, handler, metadata, parser);
         } catch (SAXException e) {
             throw new RuntimeException(e);
         } catch (IOException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index e2acb52..320cc74 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -31,6 +31,7 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.hpsf.ClassID;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.DetectorProxy;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
@@ -43,7 +44,6 @@ import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
-import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -55,6 +55,7 @@ abstract class AbstractPOIFSExtractor {
     private MimeTypes mimeTypes;
     private Detector detector;
     private Metadata metadata;
+    private final Detector zipDetectorProxy;
 
     protected AbstractPOIFSExtractor(ParseContext context) {
         this(context, null);
@@ -74,6 +75,7 @@ abstract class AbstractPOIFSExtractor {
         this.mimeTypes = context.get(MimeTypes.class);
         this.detector = context.get(Detector.class);
         this.metadata = metadata;
+        this.zipDetectorProxy = new DetectorProxy("org.apache.tika.parser.pkg.ZipContainerDetector", getClass().getClassLoader());
     }
 
     // Note - these cache, but avoid creating the default TikaConfig if not needed
@@ -159,8 +161,7 @@ abstract class AbstractPOIFSExtractor {
 
             try (TikaInputStream stream = TikaInputStream.get(
                     new DocumentInputStream((DocumentEntry) ooxml))) {
-                ZipContainerDetector detector = new ZipContainerDetector();
-                MediaType type = detector.detect(stream, new Metadata());
+                MediaType type = zipDetectorProxy.detect(stream, new Metadata());
                 handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
                 return;
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index e224d54..345dd24 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -47,7 +47,8 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -71,10 +72,11 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
     final NumberFormat currencyFormatter;
     final DateFormat shortDateTimeFormatter;
 
-    final HtmlParser htmlParser = new HtmlParser();
+    private final Parser htmlParserProxy;
 
     protected JackcessExtractor(ParseContext context, Locale locale) {
         super(context);
+        this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
         currencyFormatter = NumberFormat.getCurrencyInstance(locale);
         shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
     }
@@ -200,7 +202,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
                 Metadata m = new Metadata();
                 m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
                 try {
-                    htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
+                    htmlParserProxy.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
                             h,
                            m, EMPTY_PARSE_CONTEXT);
                     handler.characters(h.toString());

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 3a85882..108d5eb 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -23,6 +23,7 @@ import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.List;
 import java.util.Locale;
@@ -44,14 +45,15 @@ import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.util.CodePageUtil;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.EncodingDetectorProxy;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.html.HtmlEncodingDetector;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.parser.mbox.MboxParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
 import org.apache.tika.parser.rtf.RTFParser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
@@ -67,7 +69,9 @@ import static java.nio.charset.StandardCharsets.UTF_8;
  */
 public class OutlookExtractor extends AbstractPOIFSExtractor {
     private static final Metadata EMPTY_METADATA = new Metadata();
-    HtmlEncodingDetector detector = new HtmlEncodingDetector();
+    private final SimpleDateFormat dateFormat;
+    private final EncodingDetector htmlEncodingDetectorProxy;
+    private final Parser htmlParserProxy;
 
     private final MAPIMessage msg;
 
@@ -77,7 +81,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
 
     public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
         super(context);
-
+        this.htmlEncodingDetectorProxy = new EncodingDetectorProxy("org.apache.tika.parser.html.HtmlEncodingDetector", getClass().getClassLoader());
+        this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader());
+        this.dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
         try {
             this.msg = new MAPIMessage(root);
         } catch (IOException e) {
@@ -135,7 +141,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
 
                                 // See if we can parse it as a normal mail date
                                 try {
-                                    Date d = MboxParser.parseDate(date);
+                                    
+                                    Date d = dateFormat.parse(date);
                                     metadata.set(TikaCoreProperties.CREATED, d);
                                     metadata.set(TikaCoreProperties.MODIFIED, d);
                                 } catch (ParseException e) {
@@ -196,8 +203,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                     data = ((StringChunk) htmlChunk).getRawValue();
                 }
                 if (data != null) {
-                    HtmlParser htmlParser = new HtmlParser();
-                    htmlParser.parse(
+                    htmlParserProxy.parse(
                             new ByteArrayInputStream(data),
                             new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                             new Metadata(), new ParseContext()
@@ -341,7 +347,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
             if(html != null && html.length() > 0) {
                 Charset charset = null;
                 try {
-                    charset = detector.detect(new ByteArrayInputStream(
+                    charset = htmlEncodingDetectorProxy.detect(new ByteArrayInputStream(
                             html.getBytes(UTF_8)), EMPTY_METADATA);
                 } catch (IOException e) {
                     //swallow

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index add1f2c..518a000 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -39,7 +39,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.parser.opc.OPCDetector;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -74,7 +74,7 @@ public class OOXMLExtractorFactory {
             }
 
             // Get the type, and ensure it's one we handle
-            MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+            MediaType type = OPCDetector.detectOfficeOpenXML(pkg);
             if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
                 // Not a supported type, delegate to Empty Parser
                 EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
new file mode 100644
index 0000000..cc17459
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector that detects OPC Packages
+ *
+ */
+public class OPCDetector implements Detector {
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = -3569622763024617244L;
+    
+    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
+    private static final String VISIO_DOCUMENT =
+            "http://schemas.microsoft.com/visio/2010/relationships/document";
+    
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
+    private static final String STRICT_CORE_DOCUMENT = 
+            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+    
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream stream = TikaInputStream.get(input, tmp);
+            // Use POI to open and investigate it for us
+            OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
+            stream.setOpenContainer(pkg);
+    
+            // Is at an OOXML format?
+            MediaType type = detectOfficeOpenXML(pkg);
+            if (type != null) return type;
+            
+            // Is it XPS format?
+            type = detectXPSOPC(pkg);
+            if (type != null) return type;
+            
+            // Is it an AutoCAD format?
+            type = detectAutoCADOPC(pkg);
+            
+            return type;
+        } catch (InvalidFormatException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }finally {
+            tmp.close();
+        }
+        return null;
+    }
+    
+    /**
+     * Detects the type of an OfficeOpenXML (OOXML) file from
+     *  opened Package 
+     */
+    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+        // Check for the normal Office core document
+        PackageRelationshipCollection core = 
+               pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+        // Otherwise check for some other Office core document types
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+        }
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
+        }
+        
+        // If we didn't find a single core document of any type, skip detection
+        if (core.size() != 1) {
+            // Invalid OOXML Package received
+            return null;
+        }
+
+        // Get the type of the core document part
+        PackagePart corePart = pkg.getPart(core.getRelationship(0));
+        String coreType = corePart.getContentType();
+
+        // Turn that into the type of the overall document
+        String docType = coreType.substring(0, coreType.lastIndexOf('.'));
+
+        // The Macro Enabled formats are a little special
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+            docType = docType.toLowerCase(Locale.ROOT) + ".12";
+        }
+
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
+            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
+        }
+
+        // Build the MediaType object and return
+        return MediaType.parse(docType);
+    }
+    /**
+     * Detects Open XML Paper Specification (XPS)
+     */
+    private static MediaType detectXPSOPC(OPCPackage pkg) {
+        PackageRelationshipCollection xps = 
+                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
+        if (xps.size() == 1) {
+            return MediaType.application("vnd.ms-xpsdocument");
+        } else {
+            // Non-XPS Package received
+            return null;
+        }
+    }
+    /**
+     * Detects AutoCAD formats that live in OPC packaging
+     */
+    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
+        PackageRelationshipCollection dwfxSeq = 
+                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+        if (dwfxSeq.size() == 1) {
+            return MediaType.parse("model/vnd.dwfx+xps");
+        } else {
+            // Non-AutoCAD Package received
+            return null;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-package-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/pom.xml b/tika-parser-modules/tika-parser-package-module/pom.xml
index 2e226d2..8d1238d 100644
--- a/tika-parser-modules/tika-parser-package-module/pom.xml
+++ b/tika-parser-modules/tika-parser-package-module/pom.xml
@@ -35,21 +35,6 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi-ooxml</artifactId>
-      <version>${poi.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>stax</groupId>
-          <artifactId>stax-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>xml-apis</groupId>
-          <artifactId>xml-apis</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
       <groupId>org.tukaani</groupId>
       <artifactId>xz</artifactId>
       <version>${tukaani.version}</version>
@@ -69,13 +54,17 @@
       <artifactId>commons-compress</artifactId>
       <version>${commons.compress.version}</version>
     </dependency>
-    <dependency>
+    <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-text-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
-
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>${codec.version}</version>
+    </dependency>
   </dependencies>
 
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 9ca6729..8276e9a 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Iterator;
-import java.util.Locale;
 import java.util.Set;
 import java.util.regex.Pattern;
 
@@ -36,12 +35,7 @@ import org.apache.commons.compress.compressors.CompressorException;
 import org.apache.commons.compress.compressors.CompressorInputStream;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.commons.io.IOUtils;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.AbstractDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
@@ -57,18 +51,16 @@ import static java.nio.charset.StandardCharsets.UTF_8;
  * A detector that works on Zip documents and other archive and compression
  * formats to figure out exactly what the file is.
  */
-public class ZipContainerDetector implements Detector {
-    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+public class ZipContainerDetector extends AbstractDetector {
 
-    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
-    private static final String VISIO_DOCUMENT =
-            "http://schemas.microsoft.com/visio/2010/relationships/document";
-    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
-    private static final String STRICT_CORE_DOCUMENT = 
-            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
-    
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
+    
+    private final Detector opcDetector;
+    
+    public ZipContainerDetector() {
+        this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector");
+    }
 
     public MediaType detect(InputStream input, Metadata metadata)
             throws IOException {
@@ -138,7 +130,7 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
-    private static MediaType detectZipFormat(TikaInputStream tis) {
+    private MediaType detectZipFormat(TikaInputStream tis) {
         try {
             ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
             try {
@@ -199,24 +191,11 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
-    private static MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+    private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
         try {
             if (zip.getEntry("_rels/.rels") != null
                     || zip.getEntry("[Content_Types].xml") != null) {
-                // Use POI to open and investigate it for us
-                OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
-                stream.setOpenContainer(pkg);
-
-                // Is at an OOXML format?
-                MediaType type = detectOfficeOpenXML(pkg);
-                if (type != null) return type;
-                
-                // Is it XPS format?
-                type = detectXPSOPC(pkg);
-                if (type != null) return type;
-                
-                // Is it an AutoCAD format?
-                type = detectAutoCADOPC(pkg);
+                MediaType type = this.opcDetector.detect(stream, null);
                 if (type != null) return type;
                 
                 // We don't know what it is, sorry
@@ -228,77 +207,9 @@ public class ZipContainerDetector implements Detector {
             return null;
         } catch (RuntimeException e) {
             return null;
-        } catch (InvalidFormatException e) {
-            return null;
-        }
-    }
-    /**
-     * Detects the type of an OfficeOpenXML (OOXML) file from
-     *  opened Package 
-     */
-    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
-        // Check for the normal Office core document
-        PackageRelationshipCollection core = 
-               pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
-        // Otherwise check for some other Office core document types
-        if (core.size() == 0) {
-            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
-        }
-        if (core.size() == 0) {
-            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
-        }
-        
-        // If we didn't find a single core document of any type, skip detection
-        if (core.size() != 1) {
-            // Invalid OOXML Package received
-            return null;
-        }
-
-        // Get the type of the core document part
-        PackagePart corePart = pkg.getPart(core.getRelationship(0));
-        String coreType = corePart.getContentType();
-
-        // Turn that into the type of the overall document
-        String docType = coreType.substring(0, coreType.lastIndexOf('.'));
-
-        // The Macro Enabled formats are a little special
-        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
-            docType = docType.toLowerCase(Locale.ROOT) + ".12";
-        }
-
-        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
-            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
-        }
-
-        // Build the MediaType object and return
-        return MediaType.parse(docType);
-    }
-    /**
-     * Detects Open XML Paper Specification (XPS)
-     */
-    private static MediaType detectXPSOPC(OPCPackage pkg) {
-        PackageRelationshipCollection xps = 
-                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
-        if (xps.size() == 1) {
-            return MediaType.application("vnd.ms-xpsdocument");
-        } else {
-            // Non-XPS Package received
-            return null;
-        }
-    }
-    /**
-     * Detects AutoCAD formats that live in OPC packaging
-     */
-    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
-        PackageRelationshipCollection dwfxSeq = 
-                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
-        if (dwfxSeq.size() == 1) {
-            return MediaType.parse("model/vnd.dwfx+xps");
-        } else {
-            // Non-AutoCAD Package received
-            return null;
         }
     }
+    
 
     private static MediaType detectIWork(ZipFile zip) {
         if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {