You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/07/25 13:46:52 UTC

svn commit: r1613395 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Author: tallison
Date: Fri Jul 25 11:46:51 2014
New Revision: 1613395

URL: http://svn.apache.org/r1613395
Log:
TIKA-1375: decrease memory consumption when extracting images in PDFs

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1613395&r1=1613394&r2=1613395&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Jul 25 11:46:51 2014
@@ -286,6 +286,7 @@ class PDF2XHTML extends PDFTextStripper 
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
         }
+        page.clear();
     }
 
     private void extractImages(PDResources resources) throws SAXException {
@@ -334,6 +335,7 @@ class PDF2XHTML extends PDFTextStripper 
                     ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                     try {
                         image.write2OutputStream(buffer);
+                        image.clear();
                         extractor.parseEmbedded(
                                 new ByteArrayInputStream(buffer.toByteArray()),
                                 new EmbeddedContentHandler(handler),
@@ -344,6 +346,7 @@ class PDF2XHTML extends PDFTextStripper 
                 }
             }
         }
+        resources.clear();
     }
 
     protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {