You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/22 07:50:48 UTC
svn commit: r1173951 - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser:
feed/FeedParser.java image/xmp/JempboxExtractor.java
microsoft/OfficeParser.java microsoft/ooxml/OOXMLExtractorFactory.java
pdf/PDFParser.java
Author: jukka
Date: Thu Sep 22 05:50:47 2011
New Revision: 1173951
URL: http://svn.apache.org/viewvc?rev=1173951&view=rev
Log:
TIKA-709: Tika network server does not print anything in response to, for example, Word documents.
Ensure that the parsers won't close the document stream, as specified in the Parser interface contract.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java Thu Sep 22 05:50:47 2011
@@ -26,6 +26,7 @@ import java.util.List;
import java.util.Set;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -65,7 +66,7 @@ public class FeedParser extends Abstract
// set the encoding?
try {
SyndFeed feed = new SyndFeedInput().build(
- new InputSource(stream));
+ new InputSource(new CloseShieldInputStream(stream)));
String title = stripTags(feed.getTitleEx());
String description = stripTags(feed.getDescriptionEx());
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Thu Sep 22 05:50:47 2011
@@ -44,14 +44,10 @@ public class JempboxExtractor {
public JempboxExtractor(Metadata metadata) {
this.metadata = metadata;
}
-
- public void parse(InputStream file)
- throws IOException, TikaException {
-
+
+ public void parse(InputStream file) throws IOException, TikaException {
ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
- boolean found = scanner.parse(file, xmpraw);
- file.close();
- if (!found) {
+ if (!scanner.parse(file, xmpraw)) {
return;
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Thu Sep 22 05:50:47 2011
@@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.N
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -166,13 +167,15 @@ public class OfficeParser extends Abstra
NPOIFSFileSystem filesystem;
TikaInputStream tstream = TikaInputStream.cast(stream);
if (tstream == null) {
- filesystem = new NPOIFSFileSystem(stream);
+ filesystem =
+ new NPOIFSFileSystem(new CloseShieldInputStream(stream));
} else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) {
filesystem = (NPOIFSFileSystem) tstream.getOpenContainer();
} else if (tstream.hasFile()) {
filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
} else {
- filesystem = new NPOIFSFileSystem(tstream);
+ filesystem =
+ new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
}
// Parse summary entries first, to make metadata available early
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Sep 22 05:50:47 2011
@@ -32,6 +32,7 @@ import org.apache.poi.xssf.extractor.XSS
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -57,18 +58,19 @@ public class OOXMLExtractorFactory {
OOXMLExtractor extractor;
POIXMLTextExtractor poiExtractor;
- if(stream instanceof TikaInputStream &&
- ((TikaInputStream)stream).getOpenContainer() != null) {
- poiExtractor = ExtractorFactory.createExtractor(
- (OPCPackage)((TikaInputStream)stream).getOpenContainer()
- );
- } else if (stream instanceof TikaInputStream &&
- ((TikaInputStream) stream).hasFile()) {
- poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(((TikaInputStream) stream).getFile());
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
+ poiExtractor = ExtractorFactory.createExtractor(
+ (OPCPackage) tis.getOpenContainer());
+ } else if (tis != null && tis.hasFile()) {
+ poiExtractor = (POIXMLTextExtractor)
+ ExtractorFactory.createExtractor(tis.getFile());
} else {
- poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
+ InputStream shield = new CloseShieldInputStream(stream);
+ poiExtractor = (POIXMLTextExtractor)
+ ExtractorFactory.createExtractor(shield);
}
-
+
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Thu Sep 22 05:50:47 2011
@@ -31,6 +31,7 @@ import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
@@ -71,7 +72,8 @@ public class PDFParser extends AbstractP
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- PDDocument pdfDocument = PDDocument.load(stream, true);
+ PDDocument pdfDocument =
+ PDDocument.load(new CloseShieldInputStream(stream), true);
try {
if (pdfDocument.isEncrypted()) {
try {