You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/03 20:29:23 UTC
[tika] branch master updated: TIKA-2765
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 93a2c27 TIKA-2765
93a2c27 is described below
commit 93a2c2740edd803cba0273dec92e69c1f2a60dbe
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jan 3 15:29:11 2019 -0500
TIKA-2765
---
CHANGES.txt | 5 +-
.../src/test/java/org/apache/tika/TikaTest.java | 28 +++-
.../parser/microsoft/POIFSContainerDetector.java | 5 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 80 ++++++++++-
.../tika/parser/pkg/ZipContainerDetector.java | 147 +++++++++++++++++++-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 152 +++++++++++++++++++++
6 files changed, 406 insertions(+), 11 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 8bd4fd5..376a109 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,7 +5,10 @@ Release 2.0.0 - ???
Other changes
-Release 1.21 - ???
+
+Release 1.21 - ????
+
+ * Try to handle truncated OOXML files more robustly (TIKA-2765).
Release 1.20 - 12/17/2018
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index dde60a3..0de69aa 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -211,6 +211,10 @@ public abstract class TikaTest {
}
}
+    /**
+     * Recursively parses a bundled test document using a fresh {@link ParseContext}
+     * and a fresh {@link Metadata} object.
+     *
+     * @param filePath name of the test resource to parse
+     * @param suppressException passed through to the four-argument overload, which
+     *                          decides whether a parse failure is rethrown
+     * @return the metadata list produced by the four-argument overload
+     * @throws Exception whatever the four-argument overload throws
+     */
+    protected List<Metadata> getRecursiveMetadata(String filePath, boolean suppressException) throws Exception {
+        ParseContext freshContext = new ParseContext();
+        Metadata freshMetadata = new Metadata();
+        return getRecursiveMetadata(filePath, freshContext, freshMetadata, suppressException);
+    }
+
protected List<Metadata> getRecursiveMetadata(String filePath) throws Exception {
return getRecursiveMetadata(filePath, new ParseContext());
}
@@ -220,18 +224,36 @@ public abstract class TikaTest {
}
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
+ return getRecursiveMetadata(filePath, context, metadata, false);
+ }
+
+ protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata,
+ boolean suppressException) throws Exception {
+ try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+ return getRecursiveMetadata(is, context, metadata, suppressException);
+ }
+ }
+
+    /**
+     * Recursively parses an already-open stream using a fresh {@link ParseContext}
+     * and a fresh {@link Metadata} object. The stream is not closed here.
+     *
+     * @param is stream to parse
+     * @param suppressException passed through unchanged to the four-argument overload
+     * @return the metadata list returned by the four-argument overload
+     * @throws Exception whatever the four-argument overload throws
+     */
+    protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
+        ParseContext freshContext = new ParseContext();
+        Metadata freshMetadata = new Metadata();
+        return getRecursiveMetadata(is, freshContext, freshMetadata, suppressException);
+    }
+ protected List<Metadata> getRecursiveMetadata(InputStream is, ParseContext context, Metadata metadata,
+ boolean suppressException) throws Exception {
Parser p = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
-
- try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+ try {
wrapper.parse(is, handler, metadata, context);
+ } catch (Exception e) {
+ if (!suppressException) {
+ throw e;
+ }
}
return handler.getMetadataList();
}
- protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
Parser p = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 1b5a0a9..6f32984 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
import static org.apache.tika.mime.MediaType.application;
import java.io.File;
@@ -440,7 +441,9 @@ public class POIFSContainerDetector implements Detector {
|| input.read() != 0x1a || input.read() != 0xe1) {
return MediaType.OCTET_STREAM;
}
- } finally {
+ } catch (IOException e) {
+ return MediaType.OCTET_STREAM;
+ } finally {
input.reset();
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index a6e111a..bcf8ea8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -16,15 +16,22 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
@@ -41,11 +48,13 @@ import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.core.ChmExtractor;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
@@ -53,6 +62,8 @@ import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtra
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.pkg.ZipContainerDetector;
import org.apache.xmlbeans.XmlException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -62,6 +73,8 @@ import org.xml.sax.SAXException;
*/
public class OOXMLExtractorFactory {
+ private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
+
public static void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
@@ -69,16 +82,26 @@ public class OOXMLExtractorFactory {
Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
ExtractorFactory.setThreadPrefersEventExtractors(true);
+ //if there's a problem opening the zip file;
+ //create a tmp file, and copy what you can read of it.
+ File tmpRepairedCopy = null;
+
+ OPCPackage pkg = null;
try {
OOXMLExtractor extractor = null;
- OPCPackage pkg;
// Locate or Open the OPCPackage for the file
TikaInputStream tis = TikaInputStream.cast(stream);
if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
pkg = (OPCPackage) tis.getOpenContainer();
} else if (tis != null && tis.hasFile()) {
- pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
+ try {
+ pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
+ } catch (InvalidOperationException e) {
+ tmpRepairedCopy = File.createTempFile("tika-ooxml-repair", "");
+ repairCopy(tis.getFile(), tmpRepairedCopy);
+ pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
+ }
tis.setOpenContainer(pkg);
} else {
InputStream shield = new CloseShieldInputStream(stream);
@@ -169,7 +192,60 @@ public class OOXMLExtractorFactory {
throw new TikaException("Error creating OOXML extractor", e);
} catch (XmlException e) {
throw new TikaException("Error creating OOXML extractor", e);
+ } finally {
+ if (tmpRepairedCopy != null) {
+ if (pkg != null) {
+ try {
+ pkg.close();
+ } catch (IOException e) {
+ LOG.warn("problem closing pkg file");
+ }
+ }
+ boolean deleted = tmpRepairedCopy.delete();
+ if (! deleted) {
+ LOG.warn("failed to delete tmp (repair) file: "+tmpRepairedCopy.getAbsolutePath());
+ }
+ }
+ }
+ }
+
+    /**
+     * Best-effort salvage of a zip file that could not be opened normally: streams
+     * through the entries of {@code brokenZip} and re-writes every readable,
+     * non-directory entry into {@code fixedZip}.
+     * <p>
+     * Copying stops at the first entry whose data cannot be read completely; the part
+     * of that entry that was read is still written. Any other I/O failure is logged
+     * and not rethrown, so {@code fixedZip} may be incomplete or empty afterwards.
+     *
+     * @param brokenZip the damaged (e.g. truncated) zip file to read
+     * @param fixedZip the file that receives the salvaged entries
+     */
+    private static void repairCopy(File brokenZip, File fixedZip) {
+        //all three resources are closed automatically; closing the zip input stream
+        //matters because the underlying file stream alone does not release its state
+        try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(fixedZip);
+             InputStream is = new FileInputStream(brokenZip);
+             ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is)) {
+            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+            while (zae != null) {
+                try {
+                    if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+                        outputStream.putArchiveEntry(zae);
+                        //this will copy an incomplete stream...so there
+                        //could be truncation of the xml, but the zip file
+                        //should be intact.
+                        boolean successfullyCopied = false;
+                        try {
+                            IOUtils.copy(zipArchiveInputStream, outputStream);
+                            successfullyCopied = true;
+                        } catch (IOException e) {
+                            //this can hit a "truncated ZipFile" IOException;
+                            //expected for the files this method exists for, so only debug
+                            LOG.debug("entry truncated while repairing zip; keeping the partial copy", e);
+                        }
+                        outputStream.flush();
+                        outputStream.closeArchiveEntry();
+                        if (!successfullyCopied) {
+                            //nothing readable can follow a truncated entry
+                            break;
+                        }
+                    }
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                } catch (EOFException e) {
+                    //ran off the end of the truncated file
+                    break;
+                }
+            }
+            outputStream.flush();
+            //write the central directory explicitly; the stream itself is closed
+            //by the try-with-resources statement
+            outputStream.finish();
+        } catch (IOException e) {
+            LOG.warn("problem fixing zip", e);
         }
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 08174d0..3f2303b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -19,37 +19,48 @@ package org.apache.tika.parser.pkg;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
-import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.util.ZipEntrySource;
import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
+import java.util.Map;
import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import static java.nio.charset.StandardCharsets.UTF_8;
@@ -83,7 +94,43 @@ public class ZipContainerDetector implements Detector {
private static final String XPS_DOCUMENT =
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
-
+
+    //generic OOXML type, reported when the specific flavor cannot be determined
+    private static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
+    private static final MediaType DOCX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    private static final MediaType DOCM =
+            MediaType.application("vnd.ms-word.document.macroEnabled.12");
+    //Word template (.dotx); must differ from the macro-enabled document type above
+    private static final MediaType DOTX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template");
+    private static final MediaType PPTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+    private static final MediaType PPTM =
+            MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12");
+    private static final MediaType POTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.template");
+    private static final MediaType XLSX =
+            MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    private static final MediaType XLSM =
+            MediaType.application("vnd.ms-excel.sheet.macroEnabled.12");
+
+    //entry names that suggest an OOXML package; used as a heuristic by the streaming
+    //fallback when [Content_Types].xml cannot be reached
+    private static final Set<String> OOXML_HINTS = fillSet(
+            "word/document.xml",
+            "_rels/.rels",
+            "[Content_Types].xml",
+            "ppt/presentation.xml",
+            "ppt/slides/slide1.xml",
+            "xl/workbook.xml",
+            "xl/sharedStrings.xml",
+            "xl/worksheets/sheet1.xml"
+    );
+
+    /**
+     * Builds an unmodifiable set from the given strings; duplicates collapse.
+     *
+     * @param args the values to include
+     * @return an unmodifiable set containing each distinct argument
+     */
+    static Set<String> fillSet(String ... args) {
+        Set<String> values = new HashSet<>();
+        Collections.addAll(values, args);
+        return Collections.unmodifiableSet(values);
+    }
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
@@ -240,7 +287,7 @@ public class ZipContainerDetector implements Detector {
try {
zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile()));
} catch (IOException e) {
- return null;
+ return tryStreamingDetection(stream);
}
//if (zip.getEntry("_rels/.rels") != null
@@ -486,4 +533,96 @@ public class ZipContainerDetector implements Detector {
// If we get here, not all required entries were found
return null;
}
+
+    /**
+     * Fallback detection that reads the zip entries of the stream's backing file
+     * sequentially instead of through the central directory.
+     * <p>
+     * If <code>[Content_Types].xml</code> is reached, the type found in it is returned,
+     * or the generic OOXML type if it names no known content type. Otherwise the entry
+     * names seen before the end (or before the first read failure) are compared with
+     * {@link #OOXML_HINTS}: more than two matches yield the generic OOXML type,
+     * anything else plain zip.
+     *
+     * @param stream stream whose backing file is read
+     * @return the detected type; never {@code null}
+     */
+    private static MediaType tryStreamingDetection(TikaInputStream stream) {
+        Set<String> entryNames = new HashSet<>();
+        //the zip stream is a managed resource too, so it is closed on every exit path
+        try (InputStream is = new FileInputStream(stream.getFile());
+             ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is)) {
+            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+            while (zae != null) {
+                if (!zae.isDirectory()) {
+                    entryNames.add(zae.getName());
+                    //we could also parse _rels/.rels, but if there isn't a valid
+                    //[Content_Types].xml, then POI will throw an exception...
+                    //Better to back off to the generic type than to identify
+                    //precisely a truncated file that POI cannot open.
+                    if ("[Content_Types].xml".equals(zae.getName())) {
+                        MediaType mt = parseContentTypes(zipArchiveInputStream);
+                        return mt != null ? mt : TIKA_OOXML;
+                    }
+                }
+                zae = zipArchiveInputStream.getNextZipEntry();
+            }
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: a read failure is expected on damaged files; fall through and
+            //judge by the entry names collected so far
+        }
+        int hits = 0;
+        for (String s : OOXML_HINTS) {
+            if (entryNames.contains(s)) {
+                hits++;
+            }
+        }
+        if (hits > 2) {
+            return TIKA_OOXML;
+        }
+        return MediaType.APPLICATION_ZIP;
+    }
+
+ private static MediaType parseContentTypes(InputStream is) {
+ ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
+ try {
+ XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+
+ }
+ return contentTypeHandler.mediaType;
+ }
+
+
+    /**
+     * SAX handler that inspects the <code>ContentType</code> attribute of every element
+     * and, on the first value that maps to a known OOXML media type, records that type
+     * and aborts the parse by throwing {@link StoppingEarlyException}.
+     */
+    private static class ContentTypeHandler extends DefaultHandler {
+        //filled once in the static initializer below and only read afterwards
+        private static final Map<String, MediaType> CONTENT_TYPES = new ConcurrentHashMap<>();
+        static {
+            CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX);
+            CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM);
+            CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX);
+
+            CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX);
+            CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM);
+            CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPTX);
+            CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM);
+            CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX);
+        }
+
+        //stays null until a known content type has been seen
+        private MediaType mediaType = null;
+
+        /**
+         * Checks the element's attributes for a known <code>ContentType</code> value.
+         *
+         * @throws SAXException a {@link StoppingEarlyException} as soon as a known
+         *                      content type is found, to stop the parse
+         */
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            for (int i = 0; i < attrs.getLength(); i++) {
+                if (!"ContentType".equals(attrs.getLocalName(i))) {
+                    continue;
+                }
+                //single lookup instead of containsKey followed by get
+                MediaType known = CONTENT_TYPES.get(attrs.getValue(i));
+                if (known != null) {
+                    mediaType = known;
+                    throw new StoppingEarlyException();
+                }
+            }
+        }
+    }
+
+ private static class StoppingEarlyException extends SAXException {
+ // marker only: ContentTypeHandler.startElement throws this to abort SAX parsing once a known content type is found
+ }
+
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
new file mode 100644
index 0000000..81c588d
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+/**
+ * Checks detection and parsing of OOXML files that have been cut off at various
+ * byte offsets (TIKA-2765).
+ */
+public class TruncatedOOXMLTest extends TikaTest {
+
+    //single spelling of the resource name for every test; resource lookup is
+    //case-sensitive on most file systems, so the name must match the file exactly
+    private static final String TEST_DOCX = "testWORD_various.docx";
+
+    @Test
+    public void testWordTrunc14435() throws Exception {
+        //this is only very slightly truncated
+        List<Metadata> metadataList = getRecursiveMetadata(truncate(
+                TEST_DOCX, 14435), true);
+        assertEquals(1, metadataList.size());
+        Metadata metadata = metadataList.get(0);
+        String content = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("This is the header", content);
+        assertContains("This is the footer text", content);
+        assertContains("Suddenly some Japanese", content);
+    }
+
+    @Test
+    public void testWordTrunc13138() throws Exception {
+        //this truncates the content_types.xml
+        //this tests that there's a backoff to the pkg parser
+        List<Metadata> metadataList = getRecursiveMetadata(truncate(
+                TEST_DOCX, 13138), true);
+        assertEquals(19, metadataList.size());
+        Metadata m = metadataList.get(0);
+        assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testWordTrunc774() throws Exception {
+        //this is really truncated
+        List<Metadata> metadataList = getRecursiveMetadata(truncate(
+                TEST_DOCX, 774), true);
+        assertEquals(4, metadataList.size());
+        Metadata m = metadataList.get(0);
+        assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Sanity check of the {@link #truncate(String, int)} helper itself.
+     */
+    @Test
+    public void testTruncation() throws Exception {
+
+        int length = (int)getResourceAsFile("/test-documents/" + TEST_DOCX).length();
+        Random r = new Random();
+        for (int i = 0; i < 50; i++) {
+            int targetLength = r.nextInt(length);
+            InputStream is = truncate(TEST_DOCX, targetLength);
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(is, bos);
+            assertEquals(targetLength, bos.toByteArray().length);
+        }
+        try {
+            truncate(TEST_DOCX, length+1);
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+            //expected: one byte more than the file contains was requested
+        }
+    }
+
+    /**
+     * Reads exactly the first {@code length} bytes of a test document into memory.
+     *
+     * @param fileName resource name, relative to {@code /test-documents/}
+     * @param length number of leading bytes to keep
+     * @return an in-memory stream over the first {@code length} bytes
+     * @throws EOFException if the resource has fewer than {@code length} bytes
+     * @throws IOException if reading the resource fails
+     */
+    private InputStream truncate(String fileName, int length) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        int bufferSize = 4096;
+        byte[] buffer = new byte[bufferSize];
+        int toRead = length;
+        try (InputStream is = getResourceAsStream("/test-documents/"+fileName)) {
+            while (toRead > 0) {
+                int justRead = is.read(buffer, 0, Math.min(bufferSize, toRead));
+                if (justRead == -1) {
+                    throw new EOFException("eof reached");
+                }
+                bos.write(buffer, 0, justRead);
+                toRead -= justRead;
+            }
+        }
+        return new ByteArrayInputStream(bos.toByteArray());
+    }
+
+    /**
+     * Prints the first zip entries of each .xlsx test document, and the content of
+     * <code>_rels/.rels</code> when it is among them.
+     */
+    @Test
+    @Ignore("for dev/debugging only")
+    public void listStreams() throws Exception {
+        File tstDir = new File(TruncatedOOXMLTest.class.getResource("/test-documents").toURI());
+        for (File f : tstDir.listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            //widen this check to .pptx/.docx when inspecting those formats
+            if (!f.getName().endsWith(".xlsx")) {
+                continue;
+            }
+            try (InputStream is = new FileInputStream(f);
+                 ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is)) {
+                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+                int cnt = 0;
+                while (zae != null && ! zae.isDirectory() && ++cnt <= 10) {
+                    System.out.println(f.getName() + " : " + zae.getName());
+                    if (zae.getName().equals("_rels/.rels")) {
+                        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                        IOUtils.copy(zipArchiveInputStream, bos);
+                        System.out.println(new String(bos.toByteArray(), StandardCharsets.UTF_8));
+                    }
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                }
+            } catch (Exception e) {
+                System.out.println(f.getName() + " : "+e.getMessage());
+            }
+        }
+    }
+}