You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/01/19 12:27:30 UTC

svn commit: r1060770 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java

Author: jukka
Date: Wed Jan 19 11:27:30 2011
New Revision: 1060770

URL: http://svn.apache.org/viewvc?rev=1060770&view=rev
Log:
TIKA-567: Temporary file leak in TikaInputStream

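Prevent ParserContainerExtractor from leaking temporary files: each embedded
resource is now wrapped using a TemporaryFiles instance that is disposed in a
finally block. Also refactored the code a bit to simplify and clean things up
(the anonymous Parser became the private inner class RecursiveParser).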

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java?rev=1060770&r1=1060769&r2=1060770&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java Wed Jan 19 11:27:30 2011
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.extractor;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Set;
@@ -24,6 +25,7 @@ import org.apache.tika.config.TikaConfig
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -35,18 +37,15 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * An implementation of {@link ContainerExtractor} powered by the
- *  regular {@link Parser} classes.
- * This allows you to easily extract out all the embedded resources
- *  from within contain files, whilst using the normal parsers
- *  to do the work.
- * By default the {@link AutoDetectParser} will be used, to allow
- *  extraction from the widest range of containers.
+ * An implementation of {@link ContainerExtractor} powered by the regular
+ * {@link Parser} API. This allows you to easily extract out all the
+ * embedded resources from within container files supported by normal Tika
+ * parsers. By default the {@link AutoDetectParser} will be used, to allow
+ * extraction from the widest range of containers.
  */
 public class ParserContainerExtractor implements ContainerExtractor {
-    /**
-     * Serial version UID
-     */
+
+    /** Serial version UID */
     private static final long serialVersionUID = 2261131045580861514L;
 
     private final Parser parser;
@@ -73,54 +72,71 @@ public class ParserContainerExtractor im
     }
 
     public void extract(
-            TikaInputStream stream, final ContainerExtractor recurseExtractor,
-            final EmbeddedResourceHandler handler)
+            TikaInputStream stream, ContainerExtractor recurseExtractor,
+            EmbeddedResourceHandler handler)
             throws IOException, TikaException {
         ParseContext context = new ParseContext();
-        context.set(Parser.class, new Parser() {
-            public Set<MediaType> getSupportedTypes(ParseContext context) {
-                return parser.getSupportedTypes(context);
-            }
-            public void parse(InputStream stream, ContentHandler ignored,
-                    Metadata metadata, ParseContext context)
-                    throws IOException, SAXException, TikaException {
+        context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
+        try {
+            parser.parse(stream, new DefaultHandler(), new Metadata(), context);
+        } catch (SAXException e) {
+            throw new TikaException("Unexpected SAX exception", e);
+        }
+    }
+
+    private class RecursiveParser implements Parser {
+
+        private final ContainerExtractor extractor;
+
+        private final EmbeddedResourceHandler handler;
+
+        private RecursiveParser(
+                ContainerExtractor extractor,
+                EmbeddedResourceHandler handler) {
+            this.extractor = extractor;
+            this.handler = handler;
+        }
+
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            return parser.getSupportedTypes(context);
+        }
+
+        public void parse(
+                InputStream stream, ContentHandler ignored,
+                Metadata metadata, ParseContext context)
+                throws IOException, SAXException, TikaException {
+            TemporaryFiles tmp = new TemporaryFiles();
+            try {
+                TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
                 // Figure out what we have to process
                 String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
-                MediaType type;
-                if(metadata.get(Metadata.CONTENT_TYPE) != null) {
-                   type = MediaType.parse( metadata.get(Metadata.CONTENT_TYPE) );
+                MediaType type = detector.detect(tis, metadata);
+
+                if (extractor == null) {
+                    // Let the handler process the embedded resource 
+                    handler.handle(filename, type, tis);
                 } else {
-                   if(! stream.markSupported()) {
-                      stream = TikaInputStream.get(stream);
-                   }
-                   type = detector.detect(stream, metadata);
-                }
-                
-                // Let the handler process the embedded resource 
-                handler.handle(filename, type, stream);
-                
-                // Recurse if requested
-                if(recurseExtractor != null) {
-                   if(recurseExtractor == ParserContainerExtractor.this) {
-                      parser.parse(stream, new DefaultHandler(), metadata, context);
-                   } else {
-                      recurseExtractor.extract(
-                            TikaInputStream.get(stream), recurseExtractor, handler
-                      );
-                   }
+                    // Use a temporary file to process the stream twice
+                    File file = tis.getFile();
+
+                    // Let the handler process the embedded resource 
+                    handler.handle(filename, type, TikaInputStream.get(file));
+
+                    // Recurse
+                    extractor.extract(tis, extractor, handler);
                 }
+            } finally {
+                tmp.dispose();
             }
-            public void parse(InputStream stream, ContentHandler handler,
-                    Metadata metadata) throws IOException, SAXException,
-                    TikaException {
-                parse(stream, handler, metadata, new ParseContext());
-            }
-        });
-        try {
-            parser.parse(stream, new DefaultHandler(), new Metadata(), context);
-        } catch (SAXException e) {
-            throw new TikaException("Unexpected SAX exception", e);
         }
+
+        public void parse(
+                InputStream stream, ContentHandler handler, Metadata metadata)
+                throws IOException, SAXException, TikaException {
+            parse(stream, handler, metadata, new ParseContext());
+        }
+
     }
 
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1060770&r1=1060769&r2=1060770&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Wed Jan 19 11:27:30 2011
@@ -46,8 +46,8 @@ public abstract class AbstractPOIContain
     public static final MediaType TYPE_JPG = MediaType.image("jpeg");
     public static final MediaType TYPE_GIF = MediaType.image("gif");
     public static final MediaType TYPE_PNG = MediaType.image("png");
-    public static final MediaType TYPE_EMF = MediaType.image("x-emf");
-    
+    public static final MediaType TYPE_EMF = MediaType.application("x-msmetafile");
+
     protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
         InputStream input = AbstractPOIContainerExtractionTest.class.getResourceAsStream(
              "/test-documents/" + filename);