You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/06 15:11:18 UTC
[tika] branch master updated: TIKA-2552 -- upgrade to POI 4.0.0
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 01c53ea TIKA-2552 -- upgrade to POI 4.0.0
01c53ea is described below
commit 01c53eaa6d709972b03735574aa58dc0abcdc51c
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Sep 6 11:11:02 2018 -0400
TIKA-2552 -- upgrade to POI 4.0.0
---
.../org/apache/tika/batch/fs/BatchDriverTest.java | 2 +-
tika-bundle/pom.xml | 2 +
tika-eval/pom.xml | 2 +-
tika-parsers/pom.xml | 2 +-
.../tika/parser/microsoft/ExcelExtractor.java | 6 +-
.../tika/parser/microsoft/HSLFExtractor.java | 37 +-
.../parser/microsoft/JackcessCompoundOleUtil.java | 268 +++++++
.../tika/parser/microsoft/JackcessExtractor.java | 28 +-
.../tika/parser/microsoft/JackcessOleUtil.java | 813 +++++++++++++++++++++
.../apache/tika/parser/microsoft/OfficeParser.java | 21 +-
.../tika/parser/microsoft/OutlookExtractor.java | 10 +-
.../parser/microsoft/POIFSContainerDetector.java | 12 +-
.../tika/parser/microsoft/SummaryExtractor.java | 6 +-
.../tika/parser/microsoft/WordExtractor.java | 6 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 17 +-
.../parser/microsoft/ooxml/MetadataExtractor.java | 68 +-
.../parser/microsoft/ooxml/OOXMLExtractor.java | 9 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 28 +-
.../ooxml/POIXMLTextExtractorDecorator.java | 2 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 22 +-
.../ooxml/XSSFBExcelExtractorDecorator.java | 2 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 2 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 15 +-
.../microsoft/ooxml/xps/XPSTextExtractor.java | 7 +-
.../xslf/XSLFEventBasedPowerPointExtractor.java | 6 +-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 8 +-
.../tika/parser/pkg/ZipContainerDetector.java | 2 +-
.../apache/tika/parser/rtf/RTFObjDataParser.java | 11 +-
.../tika/detect/TestContainerAwareDetector.java | 7 +-
.../parser/microsoft/PowerPointParserTest.java | 6 +-
.../apache/tika/server/resource/TikaResource.java | 2 +-
31 files changed, 1264 insertions(+), 165 deletions(-)
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java
index 13e35e6..643e7cb 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java
@@ -115,7 +115,7 @@ public class BatchDriverTest extends FSBatchTestBase {
readFileToString(outputDir.resolve("test2_ok.xml.xml"), UTF_8));
}
- @Test(timeout = 30000)
+ @Test(timeout = 60000)
public void allHeavyHangsTestWithStarvedCrawler() throws Exception {
//this tests that if all consumers are hung and the crawler is
//waiting to add to the queue, there isn't deadlock. The BatchProcess should
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 20dfcf4..cbc6e35 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -282,6 +282,8 @@
org.apache.commons.httpclient.params;resolution:=optional,
org.apache.commons.httpclient.protocol;resolution:=optional,
org.apache.commons.httpclient.util;resolution:=optional,
+ org.apache.commons.math3.exception;resolution:=optional,
+ org.apache.commons.math3.linear;resolution:=optional,
org.apache.commons.vfs2;resolution:=optional,
org.apache.commons.vfs2.provider;resolution:=optional,
org.apache.commons.vfs2.util;resolution:=optional,
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index c1e71f3..8428bb7 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -36,7 +36,7 @@
<properties>
<cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? -->
<lucene.version>7.4.0</lucene.version>
- <poi.version>3.17</poi.version>
+ <poi.version>4.0.0</poi.version>
</properties>
<dependencies>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index f925d4f..5560406 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.17</poi.version>
+ <poi.version>4.0.0</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.11</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index ff5971a..0dd86ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -64,7 +64,7 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -139,7 +139,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
* or writing the extracted content
*/
protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml, locale);
}
@@ -273,7 +273,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
* @throws IOException on any IO errors.
* @throws SAXException on any SAX parsing errors.
*/
- public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
+ public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
throws IOException, SAXException, TikaException {
processFile(filesystem.getRoot(), listenForAllRecords);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 7057cbe..5095709 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -24,9 +24,7 @@ import java.util.List;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.hslf.exceptions.EncryptedPowerPointFileException;
-import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.OLEShape;
import org.apache.poi.hslf.record.DocInfoListContainer;
import org.apache.poi.hslf.record.RecordTypes;
import org.apache.poi.hslf.record.VBAInfoAtom;
@@ -35,6 +33,7 @@ import org.apache.poi.hslf.usermodel.HSLFGroupShape;
import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
import org.apache.poi.hslf.usermodel.HSLFNotes;
import org.apache.poi.hslf.usermodel.HSLFObjectData;
+import org.apache.poi.hslf.usermodel.HSLFObjectShape;
import org.apache.poi.hslf.usermodel.HSLFPictureData;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlide;
@@ -46,7 +45,9 @@ import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hslf.usermodel.HSLFTextRun;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.usermodel.Comment;
+import org.apache.poi.sl.usermodel.SimpleShape;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -67,7 +68,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
}
protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml);
}
@@ -269,9 +270,9 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
long persistId = vbaAtom.getPersistIdRef();
for (HSLFObjectData objData : ppt.getEmbeddedObjects()) {
if (objData.getExOleObjStg().getPersistId() == persistId) {
- try (NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(objData.getData())) {
+ try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem(objData.getInputStream())) {
try {
- OfficeParser.extractMacros(npoifsFileSystem, xhtml,
+ OfficeParser.extractMacros(poifsFileSystem, xhtml,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
} catch (IOException|SAXException inner) {
EmbeddedDocumentUtil.recordException(inner, parentMetadata);
@@ -295,7 +296,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
xhtml.startElement("div", "class", "slide-master-content");
for (HSLFShape shape : shapes) {
- if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) {
+ if (shape != null && ! isPlaceholder(shape)) {
if (shape instanceof HSLFTextShape) {
HSLFTextShape tsh = (HSLFTextShape) shape;
String text = tsh.getText();
@@ -308,6 +309,10 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
xhtml.endElement("div");
}
+ private boolean isPlaceholder(HSLFShape shape) {
+ return shape instanceof SimpleShape && ((SimpleShape)shape).isPlaceholder();
+ }
+
private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
xhtml.startElement("table");
for (int row = 0; row < shape.getNumberOfRows(); row++) {
@@ -449,8 +454,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
}
for (HSLFShape shape : shapes) {
- if (shape instanceof OLEShape) {
- OLEShape oleShape = (OLEShape) shape;
+ if (shape instanceof HSLFObjectShape) {
+ HSLFObjectShape oleShape = (HSLFObjectShape) shape;
HSLFObjectData data = null;
try {
data = oleShape.getObjectData();
@@ -474,14 +479,14 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
xhtml.endElement("div");
InputStream dataStream = null;
try {
- dataStream = data.getData();
+ dataStream = data.getInputStream();
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
continue;
}
try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
String mediaType = null;
- if ("Excel.Chart.8".equals(oleShape.getProgID())) {
+ if ("Excel.Chart.8".equals(oleShape.getProgId())) {
mediaType = "application/vnd.ms-excel";
} else {
MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
@@ -489,18 +494,18 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
}
if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
|| mediaType.equals("application/x-tika-msoffice")) {
- NPOIFSFileSystem npoifs = null;
+ POIFSFileSystem poifs = null;
try {
- npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+ poifs = new POIFSFileSystem(new CloseShieldInputStream(stream));
} catch (RuntimeException e) {
throw new IOExceptionWithCause(e);
}
try {
- handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+ handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
} finally {
- if (npoifs != null) {
- npoifs.close();
+ if (poifs != null) {
+ poifs.close();
}
}
} else {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
new file mode 100644
index 0000000..b09f19d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
@@ -0,0 +1,268 @@
+/*
+Copyright (c) 2013 James Ahlborn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package org.apache.tika.parser.microsoft;
+
+import com.healthmarketscience.jackcess.RuntimeIOException;
+import com.healthmarketscience.jackcess.impl.ByteUtil;
+import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
+import com.healthmarketscience.jackcess.util.MemFileChannel;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.commons.lang.builder.ToStringBuilder;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0.
+ * This class will be removed once POI 4.0.0 is released and jackcess
+ * updates to the most recent version of POI.
+ * @deprecated -- this class will be removed in Tika >= 1.20
+ */
+@Deprecated
+class JackcessCompoundOleUtil implements JackcessOleUtil.CompoundPackageFactory {
+ private static final String ENTRY_NAME_CHARSET = "UTF-8";
+ private static final String ENTRY_SEPARATOR = "/";
+ private static final String CONTENTS_ENTRY = "CONTENTS";
+
+ static {
+ // force a poi class to be loaded to ensure that when this class is
+ // loaded, we know that the poi classes are available
+ POIFSFileSystem.class.getName();
+ }
+
+ public JackcessCompoundOleUtil() {
+ }
+
+ /**
+ * Creates a new CompoundContent for the given blob.
+ */
+ public JackcessOleUtil.ContentImpl createCompoundPackageContent(
+ JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, String typeName,
+ ByteBuffer blobBb, int dataBlockLen) {
+ return new CompoundContentImpl(blob, prettyName, className, typeName,
+ blobBb.position(), dataBlockLen);
+ }
+
+ /**
+ * Gets a DocumentEntry from compound storage based on a fully qualified,
+ * encoded entry name.
+ *
+ * @param entryName fully qualified, encoded entry name
+ * @param dir root directory of the compound storage
+ * @return the relevant DocumentEntry
+ * @throws FileNotFoundException if the entry does not exist
+ * @throws IOException if some other io error occurs
+ */
+ public static DocumentEntry getDocumentEntry(String entryName,
+ DirectoryEntry dir)
+ throws IOException {
+ // split entry name into individual components and decode them
+ List<String> entryNames = new ArrayList<String>();
+ for (String str : entryName.split(ENTRY_SEPARATOR)) {
+ if (str.length() == 0) {
+ continue;
+ }
+ entryNames.add(decodeEntryName(str));
+ }
+
+ DocumentEntry entry = null;
+ Iterator<String> iter = entryNames.iterator();
+ while (iter.hasNext()) {
+ org.apache.poi.poifs.filesystem.Entry tmpEntry = dir.getEntry(iter.next());
+ if (tmpEntry instanceof DirectoryEntry) {
+ dir = (DirectoryEntry) tmpEntry;
+ } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) {
+ entry = (DocumentEntry) tmpEntry;
+ } else {
+ break;
+ }
+ }
+
+ if (entry == null) {
+ throw new FileNotFoundException("Could not find document " + entryName);
+ }
+
+ return entry;
+ }
+
+ private static String encodeEntryName(String name) {
+ try {
+ return URLEncoder.encode(name, ENTRY_NAME_CHARSET);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static String decodeEntryName(String name) {
+ try {
+ return URLDecoder.decode(name, ENTRY_NAME_CHARSET);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static final class CompoundContentImpl
+ extends JackcessOleUtil.EmbeddedPackageContentImpl
+ implements OleBlob.CompoundContent {
+ private POIFSFileSystem _fs;
+
+ private CompoundContentImpl(
+ JackcessOleUtil.OleBlobImpl blob, String prettyName, String className,
+ String typeName, int position, int length) {
+ super(blob, prettyName, className, typeName, position, length);
+ }
+
+ public OleBlob.ContentType getType() {
+ return OleBlob.ContentType.COMPOUND_STORAGE;
+ }
+
+ private POIFSFileSystem getFileSystem() throws IOException {
+ if (_fs == null) {
+ _fs = new POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r"));
+ }
+ return _fs;
+ }
+
+ public Iterator<Entry> iterator() {
+ try {
+ return getEntries(new ArrayList<Entry>(), getFileSystem().getRoot(),
+ ENTRY_SEPARATOR).iterator();
+ } catch (IOException e) {
+ throw new RuntimeIOException(e);
+ }
+ }
+
+ public EntryImpl getEntry(String entryName) throws IOException {
+ return new EntryImpl(entryName,
+ getDocumentEntry(entryName, getFileSystem().getRoot()));
+ }
+
+ public boolean hasContentsEntry() throws IOException {
+ return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY);
+ }
+
+ public EntryImpl getContentsEntry() throws IOException {
+ return getEntry(CONTENTS_ENTRY);
+ }
+
+ private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir,
+ String prefix) {
+ for (org.apache.poi.poifs.filesystem.Entry entry : dir) {
+ if (entry instanceof DirectoryEntry) {
+ // .. recurse into this directory
+ getEntries(entries, (DirectoryEntry) entry, prefix + ENTRY_SEPARATOR);
+ } else if (entry instanceof DocumentEntry) {
+ // grab the entry name/details
+ DocumentEntry de = (DocumentEntry) entry;
+ String entryName = prefix + encodeEntryName(entry.getName());
+ entries.add(new EntryImpl(entryName, de));
+ }
+ }
+ return entries;
+ }
+
+ @Override
+ public void close() {
+ ByteUtil.closeQuietly(_fs);
+ _fs = null;
+ super.close();
+ }
+
+ @Override
+ public String toString() {
+ ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+
+ try {
+ sb.append("hasContentsEntry", hasContentsEntry());
+ sb.append("entries", getEntries(new ArrayList<Entry>(),
+ getFileSystem().getRoot(),
+ ENTRY_SEPARATOR));
+ } catch (IOException e) {
+ sb.append("entries", "<" + e + ">");
+ }
+
+ return sb.toString();
+ }
+
+ private final class EntryImpl implements OleBlob.CompoundContent.Entry {
+ private final String _name;
+ private final DocumentEntry _docEntry;
+
+ private EntryImpl(String name, DocumentEntry docEntry) {
+ _name = name;
+ _docEntry = docEntry;
+ }
+
+ public OleBlob.ContentType getType() {
+ return OleBlob.ContentType.UNKNOWN;
+ }
+
+ public String getName() {
+ return _name;
+ }
+
+ public CompoundContentImpl getParent() {
+ return CompoundContentImpl.this;
+ }
+
+ public JackcessOleUtil.OleBlobImpl getBlob() {
+ return getParent().getBlob();
+ }
+
+ public long length() {
+ return _docEntry.getSize();
+ }
+
+ public InputStream getStream() throws IOException {
+ return new DocumentInputStream(_docEntry);
+ }
+
+ public void writeTo(OutputStream out) throws IOException {
+ InputStream in = null;
+ try {
+ ByteUtil.copy(in = getStream(), out);
+ } finally {
+ ByteUtil.closeQuietly(in);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return CustomToStringStyle.valueBuilder(this)
+ .append("name", _name)
+ .append("length", length())
+ .toString();
+ }
+ }
+ }
+}
+
+
+
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index bf5c5d0..3a10346 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -41,7 +41,7 @@ import com.healthmarketscience.jackcess.Row;
import com.healthmarketscience.jackcess.Table;
import com.healthmarketscience.jackcess.query.Query;
import com.healthmarketscience.jackcess.util.OleBlob;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.IOUtils;
@@ -302,8 +302,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
}
}
+
private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
- OleBlob blob = row.getBlob(cName);
+ OleBlob blob = getBlob(row, cName);
//lifted shamelessly from Jackcess's OleBlobTest
if (blob == null)
return;
@@ -367,9 +368,21 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
}
}
+ /*
+ Temporary workaround until POI 4.0.0 is released and Jackcess upgrades to it.
+ This is copy/pasted from Jackcess.
+ */
+ private OleBlob getBlob(Row row, String cName) {
+ byte[] bytes = row.getBytes(cName);
+ if (bytes == null) {
+ return null;
+ }
+ return JackcessOleUtil.parseBlob(bytes);
+ }
+
private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
InputStream is = null;
- NPOIFSFileSystem nfs = null;
+ POIFSFileSystem fileSystem = null;
try {
try {
is = cc.getStream();
@@ -379,18 +392,18 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
}
try {
- nfs = new NPOIFSFileSystem(is);
+ fileSystem = new POIFSFileSystem(is);
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
- handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+ handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml);
} finally {
- if (nfs != null) {
+ if (fileSystem != null) {
try {
- nfs.close();
+ fileSystem.close();
} catch (IOException e) {
//swallow
}
@@ -414,5 +427,6 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
}
return shortDateTimeFormatter.format(d);
}
+
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
new file mode 100644
index 0000000..a1432d6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
@@ -0,0 +1,813 @@
+/*
+Copyright (c) 2013 James Ahlborn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.Closeable;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.sql.Blob;
+import java.sql.SQLException;
+import java.sql.SQLFeatureNotSupportedException;
+import java.text.Normalizer;
+import java.util.EnumSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import static com.healthmarketscience.jackcess.util.OleBlob.*;
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import com.healthmarketscience.jackcess.impl.ByteUtil;
+import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
+import com.healthmarketscience.jackcess.impl.PageChannel;
+
+/**
+ * Utility code for working with OLE data.
+ * Temporary workaround until POI 4.0.0 is released and Jackcess is updated to use it.
+ *
+ *
+ * @author James Ahlborn
+ * @usage _advanced_class_
+ * @deprecated this class will be removed in Tika >= 1.20
+ */
+@Deprecated
+class JackcessOleUtil {
+
+
+ /**
+ * Interface used to allow optional inclusion of the poi library for working
+ * with compound ole data.
+ */
+ interface CompoundPackageFactory
+ {
+ public ContentImpl createCompoundPackageContent(
+ OleBlobImpl blob, String prettyName, String className, String typeName,
+ ByteBuffer blobBb, int dataBlockLen);
+ }
+
+ private static final int PACKAGE_SIGNATURE = 0x1C15;
+ private static final Charset OLE_CHARSET = Charset.forName("US-ASCII");
+ private static final Charset OLE_UTF_CHARSET = Charset.forName("UTF-16LE");
+ private static final byte[] COMPOUND_STORAGE_SIGNATURE =
+ {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,
+ (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1};
+ private static final String SIMPLE_PACKAGE_TYPE = "Package";
+ private static final int PACKAGE_OBJECT_TYPE = 0x02;
+ private static final int OLE_VERSION = 0x0501;
+ private static final int OLE_FORMAT = 0x02;
+ private static final int PACKAGE_STREAM_SIGNATURE = 0x02;
+ private static final int PS_EMBEDDED_FILE = 0x030000;
+ private static final int PS_LINKED_FILE = 0x010000;
+ private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of(
+ ContentType.LINK, ContentType.SIMPLE_PACKAGE, ContentType.OTHER);
+ private static final byte[] NO_DATA = new byte[0];
+ private static final int LINK_HEADER = 0x01;
+ private static final byte[] PACKAGE_FOOTER = {
+ 0x01, 0x05, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
+ };
+
+ // regex pattern which matches all the crazy extra stuff in unicode
+ private static final Pattern UNICODE_ACCENT_PATTERN =
+ Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
+
+ private static final CompoundPackageFactory COMPOUND_FACTORY;
+
+ static {
+ CompoundPackageFactory compoundFactory = null;
+ try {
+ compoundFactory = (CompoundPackageFactory)
+ Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil")
+ .newInstance();
+ } catch(Throwable t) {
+ // must not have poi, will load compound ole data as "other"
+ }
+ COMPOUND_FACTORY = compoundFactory;
+ }
+
+ /**
+ * Parses an access database blob structure and returns an appropriate
+ * OleBlob instance.
+ */
+ public static OleBlob parseBlob(byte[] bytes) {
+ return new OleBlobImpl(bytes);
+ }
+
+ /**
+ * Creates a new OleBlob instance using the given information.
+ */
+ public static OleBlob createBlob(Builder oleBuilder)
+ throws IOException
+ {
+ try {
+
+ if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) {
+ throw new IllegalArgumentException(
+ "Cannot currently create ole values of type " +
+ oleBuilder.getType());
+ }
+
+ long contentLen = oleBuilder.getContentLength();
+ byte[] contentBytes = oleBuilder.getBytes();
+ InputStream contentStream = oleBuilder.getStream();
+ byte[] packageStreamHeader = NO_DATA;
+ byte[] packageStreamFooter = NO_DATA;
+
+ switch(oleBuilder.getType()) {
+ case LINK:
+ packageStreamHeader = writePackageStreamHeader(oleBuilder);
+
+ // link "content" is file path
+ contentBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
+ contentLen = contentBytes.length;
+ break;
+
+ case SIMPLE_PACKAGE:
+ packageStreamHeader = writePackageStreamHeader(oleBuilder);
+ packageStreamFooter = writePackageStreamFooter(oleBuilder);
+ break;
+
+ case OTHER:
+ // nothing more to do
+ break;
+ default:
+ throw new RuntimeException("unexpected type " + oleBuilder.getType());
+ }
+
+ long payloadLen = packageStreamHeader.length + packageStreamFooter.length +
+ contentLen;
+ byte[] packageHeader = writePackageHeader(oleBuilder, payloadLen);
+
+ long totalOleLen = packageHeader.length + PACKAGE_FOOTER.length +
+ payloadLen;
+ if(totalOleLen > DataType.OLE.getMaxSize()) {
+ throw new IllegalArgumentException("Content size of " + totalOleLen +
+ " is too large for ole column");
+ }
+
+ byte[] oleBytes = new byte[(int)totalOleLen];
+ ByteBuffer bb = PageChannel.wrap(oleBytes);
+ bb.put(packageHeader);
+ bb.put(packageStreamHeader);
+
+ if(contentLen > 0L) {
+ if(contentBytes != null) {
+ bb.put(contentBytes);
+ } else {
+ byte[] buf = new byte[8192];
+ int numBytes = 0;
+ while((numBytes = contentStream.read(buf)) >= 0) {
+ bb.put(buf, 0, numBytes);
+ }
+ }
+ }
+
+ bb.put(packageStreamFooter);
+ bb.put(PACKAGE_FOOTER);
+
+ return parseBlob(oleBytes);
+
+ } finally {
+ ByteUtil.closeQuietly(oleBuilder.getStream());
+ }
+ }
+
+ private static byte[] writePackageHeader(Builder oleBuilder,
+ long contentLen) {
+
+ byte[] prettyNameBytes = getZeroTermStrBytes(oleBuilder.getPrettyName());
+ String className = oleBuilder.getClassName();
+ String typeName = oleBuilder.getTypeName();
+ if(className == null) {
+ className = typeName;
+ } else if(typeName == null) {
+ typeName = className;
+ }
+ byte[] classNameBytes = getZeroTermStrBytes(className);
+ byte[] typeNameBytes = getZeroTermStrBytes(typeName);
+
+ int packageHeaderLen = 20 + prettyNameBytes.length + classNameBytes.length;
+
+ int oleHeaderLen = 24 + typeNameBytes.length;
+
+ byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen];
+
+ ByteBuffer bb = PageChannel.wrap(headerBytes);
+
+ // write outer package header
+ bb.putShort((short)PACKAGE_SIGNATURE);
+ bb.putShort((short)packageHeaderLen);
+ bb.putInt(PACKAGE_OBJECT_TYPE);
+ bb.putShort((short)prettyNameBytes.length);
+ bb.putShort((short)classNameBytes.length);
+ int prettyNameOff = bb.position() + 8;
+ bb.putShort((short)prettyNameOff);
+ bb.putShort((short)(prettyNameOff + prettyNameBytes.length));
+ bb.putInt(-1);
+ bb.put(prettyNameBytes);
+ bb.put(classNameBytes);
+
+ // put ole header
+ bb.putInt(OLE_VERSION);
+ bb.putInt(OLE_FORMAT);
+ bb.putInt(typeNameBytes.length);
+ bb.put(typeNameBytes);
+ bb.putLong(0L);
+ bb.putInt((int)contentLen);
+
+ return headerBytes;
+ }
+
+ private static byte[] writePackageStreamHeader(Builder oleBuilder) {
+
+ byte[] fileNameBytes = getZeroTermStrBytes(oleBuilder.getFileName());
+ byte[] filePathBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
+
+ int headerLen = 6 + fileNameBytes.length + filePathBytes.length;
+
+ if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
+
+ headerLen += 8 + filePathBytes.length;
+
+ } else {
+
+ headerLen += 2;
+ }
+
+ byte[] headerBytes = new byte[headerLen];
+ ByteBuffer bb = PageChannel.wrap(headerBytes);
+ bb.putShort((short)PACKAGE_STREAM_SIGNATURE);
+ bb.put(fileNameBytes);
+ bb.put(filePathBytes);
+
+ if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
+ bb.putInt(PS_EMBEDDED_FILE);
+ bb.putInt(filePathBytes.length);
+ bb.put(filePathBytes, 0, filePathBytes.length);
+ bb.putInt((int) oleBuilder.getContentLength());
+ } else {
+ bb.putInt(PS_LINKED_FILE);
+ bb.putShort((short)LINK_HEADER);
+ }
+
+ return headerBytes;
+ }
+
+ private static byte[] writePackageStreamFooter(Builder oleBuilder) {
+
+ // note, these are _not_ zero terminated
+ byte[] fileNameBytes = oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET);
+ byte[] filePathBytes = oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET);
+
+ int footerLen = 12 + (filePathBytes.length * 2) + fileNameBytes.length;
+
+ byte[] footerBytes = new byte[footerLen];
+ ByteBuffer bb = PageChannel.wrap(footerBytes);
+
+ bb.putInt(filePathBytes.length/2);
+ bb.put(filePathBytes);
+ bb.putInt(fileNameBytes.length/2);
+ bb.put(fileNameBytes);
+ bb.putInt(filePathBytes.length/2);
+ bb.put(filePathBytes);
+
+ return footerBytes;
+ }
+
+ /**
+ * Creates the appropriate ContentImpl for the given blob.
+ */
+ private static ContentImpl parseContent(OleBlobImpl blob)
+ throws IOException
+ {
+ ByteBuffer bb = PageChannel.wrap(blob.getBytes());
+
+ if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) {
+ return new UnknownContentImpl(blob);
+ }
+
+ // read outer package header
+ int headerSize = bb.getShort();
+ /* int objType = */ bb.getInt();
+ int prettyNameLen = bb.getShort();
+ int classNameLen = bb.getShort();
+ int prettyNameOff = bb.getShort();
+ int classNameOff = bb.getShort();
+ /* int objSize = */ bb.getInt();
+ String prettyName = readStr(bb, prettyNameOff, prettyNameLen);
+ String className = readStr(bb, classNameOff, classNameLen);
+ bb.position(headerSize);
+
+ // read ole header
+ int oleVer = bb.getInt();
+ /* int format = */ bb.getInt();
+
+ if(oleVer != OLE_VERSION) {
+ return new UnknownContentImpl(blob);
+ }
+
+ int typeNameLen = bb.getInt();
+ String typeName = readStr(bb, bb.position(), typeNameLen);
+ bb.getLong(); // unused
+ int dataBlockLen = bb.getInt();
+ int dataBlockPos = bb.position();
+
+
+ if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) {
+ return createSimplePackageContent(
+ blob, prettyName, className, typeName, bb, dataBlockLen);
+ }
+
+ // if COMPOUND_FACTORY is null, the poi library isn't available, so just
+ // load compound data as "other"
+ if((COMPOUND_FACTORY != null) &&
+ (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) &&
+ ByteUtil.matchesRange(bb, bb.position(), COMPOUND_STORAGE_SIGNATURE)) {
+ return COMPOUND_FACTORY.createCompoundPackageContent(
+ blob, prettyName, className, typeName, bb, dataBlockLen);
+ }
+
+ // this is either some other "special" (as yet unhandled) format, or it is
+ // simply an embedded file (or it is compound data and poi isn't available)
+ return new OtherContentImpl(blob, prettyName, className,
+ typeName, dataBlockPos, dataBlockLen);
+ }
+
+ /**
+ * Parses the data block of a simple package, which starts at the current
+ * position of the given buffer.
+ *
+ * @param blob the blob which owns the data
+ * @param prettyName pretty name read from the outer package header
+ * @param className class name read from the outer package header
+ * @param typeName type name read from the ole header
+ * @param blobBb buffer over the blob bytes, positioned at the data block
+ * @param dataBlockLen length of the data block in bytes
+ * @return a SimplePackageContentImpl for an embedded file, a
+ *         LinkContentImpl for a linked file, otherwise an OtherContentImpl
+ *         (unexpected stream signature or unhandled package type)
+ */
+ private static ContentImpl createSimplePackageContent(
+ OleBlobImpl blob, String prettyName, String className, String typeName,
+ ByteBuffer blobBb, int dataBlockLen) {
+
+ int dataBlockPos = blobBb.position();
+ // work on a buffer restricted to the data block
+ ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos,
+ dataBlockPos + dataBlockLen);
+
+ int packageSig = bb.getShort();
+ if(packageSig != PACKAGE_STREAM_SIGNATURE) {
+ return new OtherContentImpl(blob, prettyName, className,
+ typeName, dataBlockPos, dataBlockLen);
+ }
+
+ String fileName = readZeroTermStr(bb);
+ String filePath = readZeroTermStr(bb);
+ int packageType = bb.getInt();
+
+ if(packageType == PS_EMBEDDED_FILE) {
+
+ int localFilePathLen = bb.getInt();
+ String localFilePath = readStr(bb, bb.position(), localFilePathLen);
+ int dataLen = bb.getInt();
+ int dataPos = bb.position();
+ // skip over the embedded file data itself
+ bb.position(dataLen + dataPos);
+
+ // remaining strings are in "reverse" order (local file path, file name,
+ // file path). these strings use a real utf charset, and therefore can
+ // "fix" problems with ascii based names (so we prefer these strings to
+ // the original strings we found)
+ int strNum = 0;
+ while(true) {
+
+ // each trailing string needs at least its 4 byte length prefix
+ int rem = bb.remaining();
+ if(rem < 4) {
+ break;
+ }
+
+ // the stored length is doubled to get the number of bytes to read
+ int strLen = bb.getInt();
+ String remStr = readStr(bb, bb.position(), strLen * 2, OLE_UTF_CHARSET);
+
+ switch(strNum) {
+ case 0:
+ localFilePath = remStr;
+ break;
+ case 1:
+ fileName = remStr;
+ break;
+ case 2:
+ filePath = remStr;
+ break;
+ default:
+ // ignore
+ }
+
+ ++strNum;
+ }
+
+ return new SimplePackageContentImpl(
+ blob, prettyName, className, typeName, dataPos, dataLen,
+ fileName, filePath, localFilePath);
+ }
+
+ if(packageType == PS_LINKED_FILE) {
+
+ bb.getShort(); //unknown
+ String linkStr = readZeroTermStr(bb);
+
+ return new LinkContentImpl(blob, prettyName, className, typeName,
+ fileName, linkStr, filePath);
+ }
+
+ return new OtherContentImpl(blob, prettyName, className,
+ typeName, dataBlockPos, dataBlockLen);
+ }
+
+ /**
+  * Reads a string of the given byte length at the given offset, decoding it
+  * with OLE_CHARSET.
+  *
+  * @param bb buffer to read from
+  * @param off offset of the first byte of the string
+  * @param len number of bytes to decode
+  * @return the result of the charset aware readStr overload
+  */
+ private static String readStr(ByteBuffer bb, int off, int len) {
+   final Charset defaultCharset = OLE_CHARSET;
+   return readStr(bb, off, len, defaultCharset);
+ }
+
+ /**
+  * Reads a string which starts at the current position of the buffer and
+  * runs up to and including the next zero byte, or to the end of the buffer
+  * if there is no zero byte.
+  *
+  * @param bb buffer to read from; consumed through the terminator
+  * @return the string as decoded by readStr
+  */
+ private static String readZeroTermStr(ByteBuffer bb) {
+   final int start = bb.position();
+
+   // consume bytes until the terminator has been swallowed or nothing is left
+   boolean sawTerminator = false;
+   while(!sawTerminator && bb.hasRemaining()) {
+     sawTerminator = (bb.get() == 0);
+   }
+
+   return readStr(bb, start, bb.position() - start);
+ }
+
+ /**
+  * Decodes {@code len} bytes of the array backing the given buffer, starting
+  * at buffer offset {@code off}, and leaves the buffer positioned just past
+  * those bytes.  A single trailing zero character, if present, is dropped.
+  *
+  * @param bb array backed buffer to read from
+  * @param off buffer offset of the first byte of the string
+  * @param len number of bytes to decode; may be 0, which yields ""
+  * @param charset charset used to decode the bytes
+  * @return the decoded string without a trailing zero character
+  */
+ private static String readStr(ByteBuffer bb, int off, int len,
+                               Charset charset) {
+   // buffer offsets are relative to arrayOffset() within the backing array
+   String str = new String(bb.array(), bb.arrayOffset() + off, len, charset);
+   bb.position(off + len);
+   // a zero length string has no last char to inspect
+   if(!str.isEmpty() && (str.charAt(str.length() - 1) == '\0')) {
+     str = str.substring(0, str.length() - 1);
+   }
+   return str;
+ }
+
+ /**
+  * Encodes the given string, followed by a zero terminator, using
+  * OLE_CHARSET after stripping whatever UNICODE_ACCENT_PATTERN matches in
+  * its decomposed form.
+  *
+  * @param str string to encode
+  * @return the encoded bytes, including the terminator
+  */
+ private static byte[] getZeroTermStrBytes(String str) {
+   // the result ends up as ascii, so "crazy" chars are turned into "nicer"
+   // ones first (e.g. "u with an umlaut" becomes a plain "u").  this may not
+   // ultimately help anything, but it is what ms access does.
+
+   // split complex chars into base char + accent
+   final String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
+   // drop the accents
+   final String stripped =
+     UNICODE_ACCENT_PATTERN.matcher(decomposed).replaceAll("");
+   // put back together whatever remains
+   final String recomposed =
+     Normalizer.normalize(stripped, Normalizer.Form.NFC);
+
+   final String terminated = recomposed + '\0';
+   return terminated.getBytes(OLE_CHARSET);
+ }
+
+
+ /**
+  * OleBlob implementation backed by an in-memory byte array.  The content
+  * is parsed lazily on the first call to {@link #getContent}.  Positions in
+  * the java.sql.Blob style methods are 1-based and are converted to 0-based
+  * array offsets internally.  Once the blob has been closed (or freed), the
+  * data accessing methods fail with an IOException or SQLException (as
+  * allowed by their signatures) instead of a NullPointerException.
+  */
+ static final class OleBlobImpl implements OleBlob
+ {
+   /** raw blob data, {@code null} once the blob has been closed */
+   private byte[] _bytes;
+   /** lazily parsed content, {@code null} until first requested */
+   private ContentImpl _content;
+
+   private OleBlobImpl(byte[] bytes) {
+     _bytes = bytes;
+   }
+
+   /**
+    * Writes the entire blob to the given stream.
+    * @throws IOException if the blob is closed or the write fails
+    */
+   public void writeTo(OutputStream out) throws IOException {
+     out.write(getBytes());
+   }
+
+   /**
+    * Returns the parsed content of this blob, parsing it on first use.
+    * @throws IOException if the blob is closed
+    */
+   public Content getContent() throws IOException {
+     if(_content == null) {
+       _content = parseContent(this);
+     }
+     return _content;
+   }
+
+   public InputStream getBinaryStream() throws SQLException {
+     return new ByteArrayInputStream(getOpenBytes());
+   }
+
+   /**
+    * @param pos 1-based position of the first byte to read
+    * @param len number of bytes made available by the stream
+    */
+   public InputStream getBinaryStream(long pos, long len)
+     throws SQLException
+   {
+     return new ByteArrayInputStream(getOpenBytes(), fromJdbcOffset(pos),
+                                     (int)len);
+   }
+
+   public long length() throws SQLException {
+     return getOpenBytes().length;
+   }
+
+   /**
+    * Returns the backing array itself (not a copy).
+    * @throws IOException if the blob is closed
+    */
+   public byte[] getBytes() throws IOException {
+     if(_bytes == null) {
+       throw new IOException("blob is closed");
+     }
+     return _bytes;
+   }
+
+   /**
+    * @param pos 1-based position of the first byte to copy
+    * @param len number of bytes to copy
+    */
+   public byte[] getBytes(long pos, int len) throws SQLException {
+     return ByteUtil.copyOf(getOpenBytes(), fromJdbcOffset(pos), len);
+   }
+
+   /**
+    * @param pattern bytes to search for
+    * @param start 1-based position at which the search begins
+    * @return the 1-based position of the match, or the negative value
+    *         returned by the underlying search if there is no match
+    */
+   public long position(byte[] pattern, long start) throws SQLException {
+     int pos = ByteUtil.findRange(PageChannel.wrap(getOpenBytes()),
+                                  fromJdbcOffset(start), pattern);
+     return((pos >= 0) ? toJdbcOffset(pos) : pos);
+   }
+
+   public long position(Blob pattern, long start) throws SQLException {
+     return position(pattern.getBytes(1L, (int)pattern.length()), start);
+   }
+
+   // this blob is read-only, all mutators are unsupported
+
+   public OutputStream setBinaryStream(long position) throws SQLException {
+     throw new SQLFeatureNotSupportedException();
+   }
+
+   public void truncate(long len) throws SQLException {
+     throw new SQLFeatureNotSupportedException();
+   }
+
+   public int setBytes(long pos, byte[] bytes) throws SQLException {
+     throw new SQLFeatureNotSupportedException();
+   }
+
+   public int setBytes(long pos, byte[] bytes, int offset, int len)
+     throws SQLException {
+     throw new SQLFeatureNotSupportedException();
+   }
+
+   public void free() {
+     close();
+   }
+
+   /**
+    * Releases the data and closes any parsed content.  May be called more
+    * than once.
+    */
+   public void close() {
+     _bytes = null;
+     ByteUtil.closeQuietly(_content);
+     _content = null;
+   }
+
+   /**
+    * Returns the backing array for the java.sql.Blob style methods.
+    * @throws SQLException if the blob has been closed
+    */
+   private byte[] getOpenBytes() throws SQLException {
+     if(_bytes == null) {
+       throw new SQLException("blob is closed");
+     }
+     return _bytes;
+   }
+
+   private static int toJdbcOffset(int off) {
+     return off + 1;
+   }
+
+   private static int fromJdbcOffset(long off) {
+     return (int)off - 1;
+   }
+
+   @Override
+   public String toString() {
+     ToStringBuilder sb = CustomToStringStyle.builder(this);
+     if(_content != null) {
+       sb.append("content", _content);
+     } else {
+       sb.append("bytes", _bytes);
+       sb.append("content", "(uninitialized)");
+     }
+     return sb.toString();
+   }
+ }
+
+ /**
+  * Base class of all content implementations.  Holds the blob which owns
+  * the content and reads the raw bytes through it.
+  */
+ static abstract class ContentImpl implements Content, Closeable
+ {
+   protected final OleBlobImpl _blob;
+
+   protected ContentImpl(OleBlobImpl blob) {
+     this._blob = blob;
+   }
+
+   /** Returns the blob which owns this content. */
+   public OleBlobImpl getBlob() {
+     return this._blob;
+   }
+
+   /**
+    * Returns the bytes of the owning blob.
+    * @throws IOException if the blob cannot provide its bytes
+    */
+   protected byte[] getBytes() throws IOException {
+     final OleBlobImpl owner = getBlob();
+     return owner.getBytes();
+   }
+
+   public void close() {
+     // nothing to release at this level
+   }
+
+   /** Appends the content type to the given builder and returns it. */
+   protected ToStringBuilder toString(ToStringBuilder sb) {
+     return sb.append("type", getType());
+   }
+ }
+
+ /**
+  * Content which lives in a sub-range of the bytes of the owning blob,
+  * described by a start position and a length.
+  */
+ static abstract class EmbeddedContentImpl extends ContentImpl
+   implements EmbeddedContent
+ {
+   /** offset of the content within the blob bytes (negative if none) */
+   private final int _position;
+   /** length of the content in bytes */
+   private final int _length;
+
+   protected EmbeddedContentImpl(OleBlobImpl blob, int position, int length)
+   {
+     super(blob);
+     _position = position;
+     _length = length;
+   }
+
+   public long length() {
+     return _length;
+   }
+
+   /**
+    * Returns a stream over the content range of the blob bytes.
+    * @throws IOException if the blob cannot provide its bytes
+    */
+   public InputStream getStream() throws IOException {
+     return new ByteArrayInputStream(getBytes(), _position, _length);
+   }
+
+   /**
+    * Writes the content range of the blob bytes to the given stream.
+    * @throws IOException if the blob cannot provide its bytes or the write
+    *         fails
+    */
+   public void writeTo(OutputStream out) throws IOException {
+     out.write(getBytes(), _position, _length);
+   }
+
+   @Override
+   protected ToStringBuilder toString(ToStringBuilder sb) {
+     super.toString(sb);
+     // the bytes are gone once the blob is closed, and toString() must not
+     // blow up with a NullPointerException in that case
+     byte[] bytes = _blob._bytes;
+     if((_position >= 0) && (bytes != null)) {
+       sb.append("content", ByteBuffer.wrap(bytes, _position, _length));
+     }
+     return sb;
+   }
+ }
+
+ /**
+  * Embedded content which additionally carries the pretty name, class name
+  * and type name of the package it was read from.
+  */
+ static abstract class EmbeddedPackageContentImpl
+   extends EmbeddedContentImpl
+   implements PackageContent
+ {
+   private final String _prettyName;
+   private final String _className;
+   private final String _typeName;
+
+   protected EmbeddedPackageContentImpl(
+       OleBlobImpl blob, String prettyName, String className,
+       String typeName, int position, int length)
+   {
+     super(blob, position, length);
+     this._prettyName = prettyName;
+     this._className = className;
+     this._typeName = typeName;
+   }
+
+   public String getPrettyName() {
+     return this._prettyName;
+   }
+
+   public String getClassName() {
+     return this._className;
+   }
+
+   public String getTypeName() {
+     return this._typeName;
+   }
+
+   /** Appends the three names, then whatever the parent class appends. */
+   @Override
+   protected ToStringBuilder toString(ToStringBuilder sb) {
+     sb.append("prettyName", _prettyName);
+     sb.append("className", _className);
+     sb.append("typeName", _typeName);
+     return super.toString(sb);
+   }
+ }
+
+ /**
+  * Package content which refers to a file by path.  It is created with a
+  * position and length of -1, as it has no range within the blob.
+  */
+ private static final class LinkContentImpl
+   extends EmbeddedPackageContentImpl
+   implements LinkContent
+ {
+   private final String _fileName;
+   private final String _linkPath;
+   private final String _filePath;
+
+   private LinkContentImpl(OleBlobImpl blob, String prettyName,
+                           String className, String typeName,
+                           String fileName, String linkPath,
+                           String filePath)
+   {
+     super(blob, prettyName, className, typeName, -1, -1);
+     this._fileName = fileName;
+     this._linkPath = linkPath;
+     this._filePath = filePath;
+   }
+
+   public ContentType getType() {
+     return ContentType.LINK;
+   }
+
+   public String getFileName() {
+     return this._fileName;
+   }
+
+   public String getLinkPath() {
+     return this._linkPath;
+   }
+
+   public String getFilePath() {
+     return this._filePath;
+   }
+
+   /**
+    * Opens the file named by the link path.
+    * @throws IOException if the file cannot be opened
+    */
+   public InputStream getLinkStream() throws IOException {
+     final String target = getLinkPath();
+     return new FileInputStream(target);
+   }
+
+   @Override
+   public String toString() {
+     ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+     sb.append("fileName", _fileName);
+     sb.append("linkPath", _linkPath);
+     sb.append("filePath", _filePath);
+     return sb.toString();
+   }
+ }
+
+ /**
+  * Package content for a file embedded in the blob, together with the file
+  * name, file path and local file path recorded for it.
+  */
+ private static final class SimplePackageContentImpl
+   extends EmbeddedPackageContentImpl
+   implements SimplePackageContent
+ {
+   private final String _fileName;
+   private final String _filePath;
+   private final String _localFilePath;
+
+   private SimplePackageContentImpl(OleBlobImpl blob, String prettyName,
+                                    String className, String typeName,
+                                    int position, int length,
+                                    String fileName, String filePath,
+                                    String localFilePath)
+   {
+     super(blob, prettyName, className, typeName, position, length);
+     this._fileName = fileName;
+     this._filePath = filePath;
+     this._localFilePath = localFilePath;
+   }
+
+   public ContentType getType() {
+     return ContentType.SIMPLE_PACKAGE;
+   }
+
+   public String getFileName() {
+     return this._fileName;
+   }
+
+   public String getFilePath() {
+     return this._filePath;
+   }
+
+   public String getLocalFilePath() {
+     return this._localFilePath;
+   }
+
+   @Override
+   public String toString() {
+     ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+     sb.append("fileName", _fileName);
+     sb.append("filePath", _filePath);
+     sb.append("localFilePath", _localFilePath);
+     return sb.toString();
+   }
+ }
+
+ /**
+  * Package content of type OTHER: a range of the blob which is exposed as
+  * is, without any further interpretation.
+  */
+ private static final class OtherContentImpl
+   extends EmbeddedPackageContentImpl
+   implements OtherContent
+ {
+   private OtherContentImpl(
+       OleBlobImpl blob, String prettyName, String className,
+       String typeName, int position, int length)
+   {
+     super(blob, prettyName, className, typeName, position, length);
+   }
+
+   public ContentType getType() {
+     return ContentType.OTHER;
+   }
+
+   @Override
+   public String toString() {
+     final ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+     return sb.toString();
+   }
+ }
+
+ /**
+  * Content of type UNKNOWN, used when the blob could not be recognized.
+  * Only the raw bytes of the blob are available.
+  */
+ private static final class UnknownContentImpl extends ContentImpl
+ {
+   private UnknownContentImpl(OleBlobImpl blob) {
+     super(blob);
+   }
+
+   public ContentType getType() {
+     return ContentType.UNKNOWN;
+   }
+
+   @Override
+   public String toString() {
+     ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+     sb.append("content", _blob._bytes);
+     return sb.toString();
+   }
+ }
+
+ }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index e418dfe..517db05 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -36,7 +36,6 @@ import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.IOUtils;
@@ -105,23 +104,23 @@ public class OfficeParser extends AbstractOfficeParser {
final DirectoryNode root;
TikaInputStream tstream = TikaInputStream.cast(stream);
- NPOIFSFileSystem mustCloseFs = null;
+ POIFSFileSystem mustCloseFs = null;
try {
if (tstream == null) {
- mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+ mustCloseFs = new POIFSFileSystem(new CloseShieldInputStream(stream));
root = mustCloseFs.getRoot();
} else {
final Object container = tstream.getOpenContainer();
- if (container instanceof NPOIFSFileSystem) {
- root = ((NPOIFSFileSystem) container).getRoot();
+ if (container instanceof POIFSFileSystem) {
+ root = ((POIFSFileSystem) container).getRoot();
} else if (container instanceof DirectoryNode) {
root = (DirectoryNode) container;
} else {
- NPOIFSFileSystem fs = null;
+ POIFSFileSystem fs = null;
if (tstream.hasFile()) {
- fs = new NPOIFSFileSystem(tstream.getFile(), true);
+ fs = new POIFSFileSystem(tstream.getFile(), true);
} else {
- fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ fs = new POIFSFileSystem(new CloseShieldInputStream(tstream));
}
//tstream will close the fs, no need to close this below
tstream.setOpenContainer(fs);
@@ -274,10 +273,6 @@ public class OfficeParser extends AbstractOfficeParser {
return detectType(fs.getRoot());
}
- public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
- return detectType(fs.getRoot());
- }
-
public static POIFSDocumentType detectType(DirectoryEntry node) {
Set<String> names = new HashSet<String>();
for (Entry entry : node) {
@@ -313,7 +308,7 @@ public class OfficeParser extends AbstractOfficeParser {
* @throws IOException on IOException if it occurs during the extraction of the embedded doc
* @throws SAXException on SAXException for writing to xhtml
*/
- public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml,
+ public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml,
EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException {
VBAMacroReader reader = null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a9a6090..136bbf2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -51,9 +51,8 @@ import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.CodePageUtil;
-import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
@@ -74,7 +73,6 @@ import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.bouncycastle.cms.Recipient;
import org.xml.sax.SAXException;
/**
@@ -128,7 +126,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
private final boolean extractAllAlternatives;
- public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+ public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException {
this(filesystem.getRoot(), context);
}
@@ -149,7 +147,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
msg.setReturnNullOnMissingChunk(true);
try {
- metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
+ metadata.set(Office.MAPI_MESSAGE_CLASS, msg.getMessageClassEnum().name());
} catch (ChunkNotFoundException e){}
// If the message contains strings that aren't stored
@@ -485,7 +483,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
metadata.add(property, chunks.get(0).toString());
}
- //TODO: replace this with getMessageClassEnum when we upgrade POI
+ //Still needed by PSTParser
public static String getMessageClass(String messageClass){
if (messageClass == null || messageClass.trim().length() == 0) {
return "UNSPECIFIED";
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 1c98690..1b5a0a9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -33,7 +33,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -385,7 +385,7 @@ public class POIFSContainerDetector implements Detector {
File file = stream.getFile();
try {
- NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);
+ POIFSFileSystem fs = new POIFSFileSystem(file, true);
// Optimize a possible later parsing process by keeping
// a reference to the already opened POI file system
@@ -423,8 +423,8 @@ public class POIFSContainerDetector implements Detector {
Set<String> names = null;
if (tis != null) {
Object container = tis.getOpenContainer();
- if (container instanceof NPOIFSFileSystem) {
- names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+ if (container instanceof POIFSFileSystem) {
+ names = getTopLevelNames(((POIFSFileSystem) container).getRoot());
} else if (container instanceof DirectoryNode) {
names = getTopLevelNames((DirectoryNode) container);
}
@@ -454,8 +454,8 @@ public class POIFSContainerDetector implements Detector {
// Detect based on the names (as available)
if (tis != null &&
tis.getOpenContainer() != null &&
- tis.getOpenContainer() instanceof NPOIFSFileSystem) {
- return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
+ tis.getOpenContainer() instanceof POIFSFileSystem) {
+ return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot());
} else {
return detect(names, null);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index 46d5591..3019731 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -32,7 +32,7 @@ import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -62,7 +62,7 @@ public class SummaryExtractor {
this.metadata = metadata;
}
- public void parseSummaries(NPOIFSFileSystem filesystem)
+ public void parseSummaries(POIFSFileSystem filesystem)
throws IOException, TikaException {
parseSummaries(filesystem.getRoot());
}
@@ -93,8 +93,6 @@ public class SummaryExtractor {
// no property stream, just skip it
} catch (UnexpectedPropertySetTypeException e) {
throw new TikaException("Unexpected HPSF document", e);
- } catch (MarkUnsupportedException e) {
- throw new TikaException("Invalid DocumentInputStream", e);
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4a80420..30bd4bb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -49,7 +49,7 @@ import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -145,7 +145,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
parse(filesystem.getRoot(), xhtml);
}
@@ -661,7 +661,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
protected void parseWord6(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
parseWord6(filesystem.getRoot(), xhtml);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index d2eb87a..58654cc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -25,13 +25,13 @@ import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.POITextExtractor;
+import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
@@ -41,9 +41,6 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -69,8 +66,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import javax.xml.parsers.SAXParser;
-
/**
* Base class for all Tika OOXML extractors.
* <p/>
@@ -119,7 +114,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
* @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
*/
public POIXMLDocument getDocument() {
- return extractor.getDocument();
+ return (POIXMLDocument)extractor.getDocument();
}
/**
@@ -422,9 +417,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (officeParserConfig.getExtractMacros()) {
try (InputStream is = macroPart.getInputStream()) {
- try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+ try (POIFSFileSystem poifs = new POIFSFileSystem(is)) {
//Macro reading exceptions are already swallowed here
- OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+ OfficeParser.extractMacros(poifs, handler, embeddedExtractor);
}
} catch (IOException e) {
throw new TikaException("Broken OOXML file", e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 30f2975..7484d69 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -16,13 +16,8 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import java.math.BigDecimal;
-import java.util.Date;
-
-import org.apache.poi.POIXMLProperties.CoreProperties;
-import org.apache.poi.POIXMLProperties.CustomProperties;
-import org.apache.poi.POIXMLProperties.ExtendedProperties;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.POIXMLProperties;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
import org.apache.poi.openxml4j.util.Nullable;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
@@ -42,6 +37,10 @@ import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import java.math.BigDecimal;
+import java.util.Date;
+import java.util.Optional;
+
/**
* OOXML metadata extractor.
* <p/>
@@ -70,7 +69,7 @@ public class MetadataExtractor {
}
}
- private void extractMetadata(CoreProperties properties, Metadata metadata) {
+ private void extractMetadata(POIXMLProperties.CoreProperties properties, Metadata metadata) {
PackagePropertiesPart propsHolder = properties
.getUnderlyingProperties();
@@ -105,7 +104,7 @@ public class MetadataExtractor {
}
- private void extractMetadata(ExtendedProperties properties,
+ private void extractMetadata(POIXMLProperties.ExtendedProperties properties,
Metadata metadata) {
CTProperties propsHolder = properties.getUnderlyingProperties();
@@ -145,7 +144,7 @@ public class MetadataExtractor {
setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
}
- private void extractMetadata(CustomProperties properties,
+ private void extractMetadata(POIXMLProperties.CustomProperties properties,
Metadata metadata) {
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
props = properties.getUnderlyingProperties();
@@ -229,31 +228,34 @@ public class MetadataExtractor {
}
}
- private <T> void setProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
- T value = nullableValue.getValue();
- if (value != null) {
- if (value instanceof Date) {
- metadata.set(property, (Date) value);
- } else if (value instanceof String) {
- metadata.set(property, (String) value);
- } else if (value instanceof Integer) {
- metadata.set(property, (Integer) value);
- } else if (value instanceof Double) {
- metadata.set(property, (Double) value);
- }
+ private <T> void setProperty(Metadata metadata, Property property, Optional<T> optionalValue) {
+ if (!optionalValue.isPresent()) {
+ return;
+ }
+ T value = optionalValue.get();
+ if (value instanceof Date) {
+ metadata.set(property, (Date) value);
+ } else if (value instanceof String) {
+ metadata.set(property, (String) value);
+ } else if (value instanceof Integer) {
+ metadata.set(property, (Integer) value);
+ } else if (value instanceof Double) {
+ metadata.set(property, (Double) value);
}
}
- private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
- T value = nullableValue.getValue();
- if (value != null) {
- if (value instanceof String) {
- metadata.add(property, (String) value);
- } else {
- throw new IllegalArgumentException("Can't add property of class: "+nullableValue.getClass());
- }
+ private <T> void addProperty(Metadata metadata, Property property, Optional<T> optionalValue) {
+ if (!optionalValue.isPresent()) {
+ return;
+ }
+ T value = optionalValue.get();
+ if (value instanceof String) {
+ metadata.add(property, (String) value);
+ } else {
+ throw new IllegalArgumentException("Can't add property of class: " + optionalValue.getClass());
}
}
+
private void setProperty(Metadata metadata, String name, Nullable<?> value) {
if (value.getValue() != null) {
setProperty(metadata, name, value.getValue().toString());
@@ -284,11 +286,11 @@ public class MetadataExtractor {
}
}
- private void addMultiProperty(Metadata metadata, Property property, Nullable<String> value) {
- if (value == null) {
+ private void addMultiProperty(Metadata metadata, Property property, Optional<String> value) {
+ if (!value.isPresent()) {
return;
}
- SummaryExtractor.addMulti(metadata, property, value.getValue());
+ SummaryExtractor.addMulti(metadata, property, value.get());
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
index f52e52d..4ef723e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
@@ -18,8 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -30,19 +29,19 @@ import org.xml.sax.SAXException;
/**
* Interface implemented by all Tika OOXML extractors.
*
- * @see org.apache.poi.POIXMLTextExtractor
+ * @see org.apache.poi.ooxml.extractor.POIXMLTextExtractor
*/
public interface OOXMLExtractor {
/**
* Returns the opened document.
*
- * @see POIXMLTextExtractor#getDocument()
+ * @see org.apache.poi.ooxml.extractor.POIXMLTextExtractor#getDocument()
*/
POIXMLDocument getDocument();
/**
- * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
+ * {@link org.apache.poi.ooxml.extractor.POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
* for OOXML by POI.
*/
MetadataExtractor getMetadataExtractor();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 5230d65..a6e111a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -21,9 +21,9 @@ import java.io.InputStream;
import java.util.Locale;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -31,8 +31,10 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.LocaleUtil;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -103,15 +105,15 @@ public class OOXMLExtractorFactory {
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
- if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
- poiExtractor = trySXSLF(pkg);
+ if (poiExtractor == null) {
+ poiExtractor = tryXSLF(pkg, config.getUseSAXPptxExtractor());
}
if (type.equals(OOXMLParser.XPS)) {
poiExtractor = new XPSTextExtractor(pkg);
}
if (poiExtractor == null) {
- poiExtractor = ExtractorFactory.createExtractor(pkg);
+ poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg);
}
POIXMLDocument document = poiExtractor.getDocument();
@@ -190,7 +192,7 @@ public class OOXMLExtractorFactory {
return null;
}
- private static POIXMLTextExtractor trySXSLF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
+ private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws XmlException, OpenXML4JException, IOException {
PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
if (packageRelationshipCollection.size() == 0) {
@@ -208,12 +210,20 @@ public class OOXMLExtractorFactory {
for (int i = 0; i < xslfRelations.length; i++) {
XSLFRelation xslfRelation = xslfRelations[i];
if (xslfRelation.getContentType().equals(targetContentType)) {
- return new XSLFEventBasedPowerPointExtractor(pkg);
+ if (eventBased) {
+ return new XSLFEventBasedPowerPointExtractor(pkg);
+ } else {
+ return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ }
}
}
if (XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType)) {
- return new XSLFEventBasedPowerPointExtractor(pkg);
+ if (eventBased) {
+ return new XSLFEventBasedPowerPointExtractor(pkg);
+ } else {
+ return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ }
}
return null;
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
index f6ec3bf..56d8a71 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
@@ -19,7 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.util.ArrayList;
import java.util.List;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.tika.parser.ParseContext;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 35dba6d..3d929ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -30,9 +30,11 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.Placeholder;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFComment;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFComments;
import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
@@ -59,8 +61,6 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
@@ -136,23 +136,21 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
}
// comments (if present)
- XSLFComments comments = slide.getComments();
+ List<XSLFComment> comments = slide.getComments();
if (comments != null) {
StringBuilder authorStringBuilder = new StringBuilder();
- for (int i = 0; i < comments.getNumberOfComments(); i++) {
+ for (int i = 0; i < comments.size(); i++) {
authorStringBuilder.setLength(0);
- CTComment comment = comments.getCommentAt(i);
+ XSLFComment comment = comments.get(i);
xhtml.startElement("p", "class", "slide-comment");
- CTCommentAuthor cta = commentAuthors.getAuthorById(comment.getAuthorId());
- if (cta != null) {
- if (cta.getName() != null) {
- authorStringBuilder.append(cta.getName());
+ if (comment.getAuthor() != null) {
+ authorStringBuilder.append(comment.getAuthor());
}
- if (cta.getInitials() != null) {
+ if (comment.getAuthorInitials() != null) {
if (authorStringBuilder.length() > 0) {
authorStringBuilder.append(" ");
}
- authorStringBuilder.append("("+cta.getInitials()+")");
+ authorStringBuilder.append("("+comment.getAuthorInitials()+")");
}
if (comment.getText() != null && authorStringBuilder.length() > 0) {
authorStringBuilder.append(" - ");
@@ -162,7 +160,7 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.characters(authorStringBuilder.toString());
xhtml.endElement("b");
}
- }
+
xhtml.characters(comment.getText());
xhtml.endElement("p");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 3001318..d4b0cfc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -21,7 +21,7 @@ import java.io.InputStream;
import java.util.List;
import java.util.Locale;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 256fd0f..337ef2d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -27,8 +27,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
index 50e1e9a..2643a3a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -17,10 +17,11 @@
package org.apache.tika.parser.microsoft.ooxml.xps;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
@@ -29,19 +30,16 @@ import org.apache.poi.openxml4j.util.ZipEntrySource;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import javax.xml.parsers.SAXParser;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
@@ -49,7 +47,6 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.zip.ZipEntry;
public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
@@ -249,10 +246,10 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException {
String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath);
ZipEntrySource zipEntrySource = zipPackage.getZipArchive();
- Enumeration<? extends ZipEntry> zipEntryEnumeration = zipEntrySource.getEntries();
- ZipEntry zipEntry = null;
+ Enumeration<? extends ZipArchiveEntry> zipEntryEnumeration = zipEntrySource.getEntries();
+ ZipArchiveEntry zipEntry = null;
while (zipEntryEnumeration.hasMoreElements()) {
- ZipEntry ze = zipEntryEnumeration.nextElement();
+ ZipArchiveEntry ze = zipEntryEnumeration.nextElement();
if (ze.getName().equals(targPath)) {
zipEntry = ze;
break;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
index 30aaf0f..0212920 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -17,9 +17,10 @@
package org.apache.tika.parser.microsoft.ooxml.xps;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLProperties;
-import org.apache.poi.POIXMLTextExtractor;
+
+import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.POIXMLProperties;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.xmlbeans.XmlException;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index af3eb74..76750ca 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -20,9 +20,9 @@ package org.apache.tika.parser.microsoft.ooxml.xslf;
import java.io.IOException;
import java.util.Date;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLProperties;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.POIXMLProperties;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index e0ad943..6ed1be7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -26,9 +26,10 @@ import java.util.List;
import java.util.Map;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLProperties;
-import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.POIXMLProperties;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
+import org.apache.poi.ooxml.util.SAXHelper;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -36,7 +37,6 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.util.SAXHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index e5b0b44..08174d0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -238,7 +238,7 @@ public class ZipContainerDetector implements Detector {
ZipEntrySource zipEntrySource = null;
try {
- zipEntrySource = new ZipFileZipEntrySource(new java.util.zip.ZipFile(stream.getFile()));
+ zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile()));
} catch (IOException e) {
return null;
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 0b3322b..0e1f2b8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -32,7 +32,8 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.util.IOUtils;
@@ -115,7 +116,7 @@ class RTFObjDataParser {
ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
boolean hasPoifs = false;
try {
- hasPoifs = NPOIFSFileSystem.hasPOIFSHeader(embIs);
+ hasPoifs = hasPOIFSHeader(embIs);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
return embObjBytes;
@@ -139,7 +140,7 @@ class RTFObjDataParser {
throws IOException {
byte[] ret = null;
- try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
+ try (POIFSFileSystem fs = new POIFSFileSystem(is)) {
DirectoryNode root = fs.getRoot();
@@ -328,5 +329,9 @@ class RTFObjDataParser {
return new byte[(int) len];
}
+
+ private static boolean hasPOIFSHeader(InputStream is) throws IOException {
+ return FileMagic.valueOf(is) == FileMagic.OLE2;
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index be90b06..77df753 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -25,10 +25,9 @@ import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
-import java.nio.file.Path;
import java.util.Random;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
@@ -44,7 +43,7 @@ import org.junit.After;
import org.junit.Test;
/**
- * Junit test class for {@link ContainerAwareDetector}
+ * Junit test class for {@link org.apache.tika.parser.microsoft.POIFSContainerDetector}
*/
public class TestContainerAwareDetector extends MultiThreadedTikaTest {
private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
@@ -185,7 +184,7 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
assertEquals(
MediaType.parse("application/vnd.ms-powerpoint"),
detector.detect(stream, new Metadata()));
- assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
+ assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem);
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 475e545..c407b94 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -141,7 +141,7 @@ public class PowerPointParserTest extends TikaTest {
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
+ //TIKA-1171, POI-62591
assertEquals(-1, content.indexOf("*"));
}
@@ -165,7 +165,7 @@ public class PowerPointParserTest extends TikaTest {
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
+ //TIKA-1171, POI-62591
assertEquals(-1, content.indexOf("*"));
}
@@ -184,7 +184,7 @@ public class PowerPointParserTest extends TikaTest {
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
+ //TIKA-1171, POI-62591
assertEquals(-1, content.indexOf("*"));
}
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 3926083..9bd4d50 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -20,7 +20,7 @@ package org.apache.tika.server.resource;
import org.apache.commons.lang.StringUtils;
import org.apache.cxf.attachment.ContentDisposition;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;