You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/21 19:51:45 UTC
[1/2] tika git commit: TIKA-2013 -- upgrade to POI 3.15 -- don't forget to close new NPOIFS and MAPIMessage
Repository: tika
Updated Branches:
refs/heads/2.x 1b32e3186 -> 32d9ece8d
TIKA-2013 -- upgrade to POI 3.15 -- don't forget to close new NPOIFS and MAPIMessage
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/12b1d435
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/12b1d435
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/12b1d435
Branch: refs/heads/2.x
Commit: 12b1d435bbdc5df9d5e396285c83ddeda44240ae
Parents: 1b32e31
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 14:23:00 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 14:23:00 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 4 +--
tika-bundle/pom.xml | 4 +--
.../tika-parser-office-bundle/pom.xml | 1 +
tika-parser-modules/pom.xml | 2 +-
.../parser/microsoft/JackcessExtractor.java | 5 +--
.../tika/parser/microsoft/OfficeParser.java | 38 ++++++++++++--------
.../tika/parser/microsoft/OutlookExtractor.java | 12 +++++--
7 files changed, 42 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index baee8b4..662217d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
Release 1.14 - ???
+ * Upgrade to POI 3.15-final (TIKA-2013).
+
* Upgrade to PDFBox 2.0.3 (TIKA-2051).
* Prevent OOM/permanent hang on some corrupt CHM files (TIKA-2040).
@@ -45,8 +47,6 @@ Release 1.14 - ???
* iCal and vCalendar (TIKA-2006)
* MBOX (TIKA-2042)
- * Upgrade to PDFBox 2.0.2 (TIKA-1996).
-
* Add configurable maximum threshold for number of events extracted
from the XMP Media Management Schema in JempboxExtractor (TIKA-1999).
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 3b7a6ce..e8f3e83 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -126,7 +126,7 @@
<Embed-Dependency>
tika-parsers;inline=true,
commons-compress, xz, commons-codec, commons-csv,
- commons-io, commons-exec, junrar,
+ commons-io, commons-exec, commons-collections4, junrar,
pdfbox,pdfbox-tools,pdfbox-debugger,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
curvesapi,
@@ -444,4 +444,4 @@
<system>Jenkins</system>
<url>https://builds.apache.org/job/Tika-trunk/</url>
</ciManagement>
-</project>
+</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-bundles/tika-parser-office-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
index f6b2169..1529c97 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
@@ -65,6 +65,7 @@
commons-lang;inline=true,
commons-io;inline=true,
commons-codec;inline=true,
+ commons-collections4;inline=true,
poi;inline=true,
poi-scratchpad;inline=true,
poi-ooxml;inline=true,
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index ef92a7c..dc3b409 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.15-beta1</poi.version>
+ <poi.version>3.15</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<pdfbox.version>2.0.3</pdfbox.version>
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index fb8a2c2..4f26ff0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -328,8 +328,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
}
private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
- NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream());
- handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+ try (NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream())) {
+ handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+ }
}
String formatCurrency(Double d, DataType type) {
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index f5f9f3e..b6681aa 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -95,26 +96,33 @@ public class OfficeParser extends AbstractParser {
final DirectoryNode root;
TikaInputStream tstream = TikaInputStream.cast(stream);
- if (tstream == null) {
- root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
- } else {
- final Object container = tstream.getOpenContainer();
- if (container instanceof NPOIFSFileSystem) {
- root = ((NPOIFSFileSystem) container).getRoot();
- } else if (container instanceof DirectoryNode) {
- root = (DirectoryNode) container;
+ NPOIFSFileSystem mustCloseFs = null;
+ try {
+ if (tstream == null) {
+ mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+ root = mustCloseFs.getRoot();
} else {
- NPOIFSFileSystem fs;
- if (tstream.hasFile()) {
- fs = new NPOIFSFileSystem(tstream.getFile(), true);
+ final Object container = tstream.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ root = ((NPOIFSFileSystem) container).getRoot();
+ } else if (container instanceof DirectoryNode) {
+ root = (DirectoryNode) container;
} else {
- fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ NPOIFSFileSystem fs = null;
+ if (tstream.hasFile()) {
+ fs = new NPOIFSFileSystem(tstream.getFile(), true);
+ } else {
+ fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ }
+ //tstream will close the fs, no need to close this below
+ tstream.setOpenContainer(fs);
+ root = fs.getRoot();
}
- tstream.setOpenContainer(fs);
- root = fs.getRoot();
}
+ parse(root, context, metadata, xhtml);
+ } finally {
+ IOUtils.closeQuietly(mustCloseFs);
}
- parse(root, context, metadata, xhtml);
xhtml.endDocument();
}
http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a922c5d..74a95e7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.microsoft;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
@@ -62,8 +64,6 @@ import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* Outlook Message Parser.
*/
@@ -260,6 +260,14 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
}
} catch (ChunkNotFoundException e) {
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+ } finally {
+ if (msg != null) {
+ try {
+ msg.close();
+ } catch (IOException e) {
+ //swallow
+ }
+ }
}
}
[2/2] tika git commit: * Maintain passed-in mime in TXTParser (TIKA-2047).
Posted by ta...@apache.org.
* Maintain passed-in mime in TXTParser (TIKA-2047).
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/32d9ece8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/32d9ece8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/32d9ece8
Branch: refs/heads/2.x
Commit: 32d9ece8d84986de240087a580e094de3f879f3c
Parents: 12b1d43
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 15:51:02 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 15:51:02 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 ++
.../main/java/org/apache/tika/parser/txt/TXTParser.java | 12 +++++++++++-
.../java/org/apache/tika/parser/txt/TXTParserTest.java | 10 ++++++++--
3 files changed, 21 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 662217d..46a5894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
Release 1.14 - ???
+ * Maintain passed-in mime in TXTParser (TIKA-2047).
+
* Upgrade to POI 3.15-final (TIKA-2013).
* Upgrade to PDFBox 2.0.3 (TIKA-2051).
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index 2b20495..2e7bb19 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser {
try (AutoDetectReader reader = new AutoDetectReader(
new CloseShieldInputStream(stream), metadata,
context.get(ServiceLoader.class, LOADER))) {
+ //try to get detected content type; could be a subclass of text/plain
+ //such as vcal, etc.
+ String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
+ MediaType mediaType = MediaType.TEXT_PLAIN;
+ if (incomingMime != null) {
+ MediaType tmpMediaType = MediaType.parse(incomingMime);
+ if (tmpMediaType != null) {
+ mediaType = tmpMediaType;
+ }
+ }
Charset charset = reader.getCharset();
- MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+ MediaType type = new MediaType(mediaType, charset);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// deprecated, see TIKA-431
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 9d9a138..17e5ba1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -196,7 +196,7 @@ public class TXTParserTest extends TikaTest {
parser.parse(
new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
}
@@ -268,7 +268,13 @@ public class TXTParserTest extends TikaTest {
parser.parse(
new ByteArrayInputStream(text.getBytes(UTF_8)),
new BodyContentHandler(), r.metadata, new ParseContext());
- assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/binary; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
}
+ //TIKA-2047
+ @Test
+ public void testSubclassingMimeTypesRemain() throws Exception {
+ XMLResult r = getXML("testVCalendar.vcs");
+ assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+ }
}