You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/22 07:50:48 UTC

svn commit: r1173951 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: feed/FeedParser.java image/xmp/JempboxExtractor.java microsoft/OfficeParser.java microsoft/ooxml/OOXMLExtractorFactory.java pdf/PDFParser.java

Author: jukka
Date: Thu Sep 22 05:50:47 2011
New Revision: 1173951

URL: http://svn.apache.org/viewvc?rev=1173951&view=rev
Log:
TIKA-709: Tika network server does not print anything in response to, for example, Word documents

Ensure that the parsers do not close the document stream, as required by the Parser interface contract.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java Thu Sep 22 05:50:47 2011
@@ -26,6 +26,7 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
@@ -65,7 +66,7 @@ public class FeedParser extends Abstract
         // set the encoding?
         try {
             SyndFeed feed = new SyndFeedInput().build(
-                    new InputSource(stream));
+                    new InputSource(new CloseShieldInputStream(stream)));
 
             String title = stripTags(feed.getTitleEx());
             String description = stripTags(feed.getDescriptionEx());

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Thu Sep 22 05:50:47 2011
@@ -44,14 +44,10 @@ public class JempboxExtractor {
     public JempboxExtractor(Metadata metadata) {
         this.metadata = metadata;
     }
-    
-    public void parse(InputStream file)
-            throws IOException, TikaException {
-    	
+
+    public void parse(InputStream file) throws IOException, TikaException {
         ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
-        boolean found = scanner.parse(file, xmpraw);
-        file.close();
-        if (!found) {
+        if (!scanner.parse(file, xmpraw)) {
             return;
         }
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Thu Sep 22 05:50:47 2011
@@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.N
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -166,13 +167,15 @@ public class OfficeParser extends Abstra
         NPOIFSFileSystem filesystem;
         TikaInputStream tstream = TikaInputStream.cast(stream);
         if (tstream == null) {
-            filesystem = new NPOIFSFileSystem(stream);
+            filesystem =
+                new NPOIFSFileSystem(new CloseShieldInputStream(stream));
         } else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) {
             filesystem = (NPOIFSFileSystem) tstream.getOpenContainer();
         } else if (tstream.hasFile()) {
             filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
         } else {
-            filesystem = new NPOIFSFileSystem(tstream);
+            filesystem =
+                new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
         }
 
         // Parse summary entries first, to make metadata available early

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Sep 22 05:50:47 2011
@@ -32,6 +32,7 @@ import org.apache.poi.xssf.extractor.XSS
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -57,18 +58,19 @@ public class OOXMLExtractorFactory {
             OOXMLExtractor extractor;
 
             POIXMLTextExtractor poiExtractor;
-            if(stream instanceof TikaInputStream && 
-            	    ((TikaInputStream)stream).getOpenContainer() != null) {
-               poiExtractor = ExtractorFactory.createExtractor(
-                    (OPCPackage)((TikaInputStream)stream).getOpenContainer()
-               );
-            } else if (stream instanceof TikaInputStream &&
-                    ((TikaInputStream) stream).hasFile()) {
-                poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(((TikaInputStream) stream).getFile());
+            TikaInputStream tis = TikaInputStream.cast(stream);
+            if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
+                poiExtractor = ExtractorFactory.createExtractor(
+                        (OPCPackage) tis.getOpenContainer());
+            } else if (tis != null && tis.hasFile()) {
+                poiExtractor = (POIXMLTextExtractor)
+                        ExtractorFactory.createExtractor(tis.getFile());
             } else {
-               poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
+                InputStream shield = new CloseShieldInputStream(stream);
+                poiExtractor = (POIXMLTextExtractor)
+                        ExtractorFactory.createExtractor(shield);
             }
-            
+
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
                extractor = new XSSFExcelExtractorDecorator(

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Thu Sep 22 05:50:47 2011
@@ -31,6 +31,7 @@ import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
@@ -71,7 +72,8 @@ public class PDFParser extends AbstractP
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        PDDocument pdfDocument = PDDocument.load(stream, true);
+        PDDocument pdfDocument =
+            PDDocument.load(new CloseShieldInputStream(stream), true);
         try {
             if (pdfDocument.isEncrypted()) {
                 try {