You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2010/10/14 11:05:40 UTC

svn commit: r1022431 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/ main/java/org/apache/pdfbox/cos/ main/java/org/apache/pdfbox/pdfparser/ main/java/org/apache/pdfbox/util/ test/java/org/apache/pdfbox/pdfparser/ test/java/org/apache/pdfbo...

Author: jukka
Date: Thu Oct 14 09:05:39 2010
New Revision: 1022431

URL: http://svn.apache.org/viewvc?rev=1022431&view=rev
Log:
PDFBOX-789: Error by text extraction

Extend the forceParsing flag to cover hex strings and all kinds of stream parsers.

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java Thu Oct 14 09:05:39 2010
@@ -229,6 +229,7 @@ public class ExtractText
                 {
                     stripper = new PDFTextStripper(encoding);
                 }
+                stripper.setForceParsing( force );
                 stripper.setSortByPosition( sort );
                 stripper.setShouldSeparateByBeads( separateBeads );
                 stripper.setStartPage( startPage );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java Thu Oct 14 09:05:39 2010
@@ -72,23 +72,62 @@ public class COSDocument extends COSBase
     /**
      * This file will store the streams in order to conserve memory.
      */
-    private RandomAccess scratchFile = null;
+    private final RandomAccess scratchFile;
 
-    private File tmpFile = null;
+    private final File tmpFile;
 
     private String headerString = "%PDF-1.4";
 
     private boolean warnMissingClose = true;
 
+    private boolean closed = false;
+
+    /**
+     * Flag to skip malformed or otherwise unparseable input where possible.
+     */
+    private final boolean forceParsing;
+
+    /**
+     * Constructor that will use the given random access file for storage
+     * of the PDF streams. The client of this method is responsible for
+     * deleting the storage if necessary that this file will write to. The
+     * close method will close the file though.
+     *
+     * @param scratchFile the random access file to use for storage
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     document content where possible
+     */
+    public COSDocument(RandomAccess scratchFile, boolean forceParsing) {
+        this.scratchFile = scratchFile;
+        this.tmpFile = null;
+        this.forceParsing = forceParsing;
+    }
+
+    /**
+     * Constructor that will use a temporary file in the given directory
+     * for storage of the PDF streams. The temporary file is automatically
+     * removed when this document gets closed.
+     *
+     * @param scratchDir directory for the temporary file,
+     *                   or <code>null</code> to use the system default
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     document content where possible
+     */
+    public COSDocument(File scratchDir, boolean forceParsing)
+            throws IOException {
+        this.tmpFile = File.createTempFile("pdfbox-", ".tmp", scratchDir);
+        this.scratchFile = new RandomAccessFile(tmpFile, "rw");
+        this.forceParsing = forceParsing;
+    }
+
     /**
      * Constructor.  Uses the java.io.tmpdir value to create a file
      * to store the streams.
      *
      *  @throws IOException If there is an error creating the tmp file.
      */
-    public COSDocument() throws IOException
-    {
-        this( new File( System.getProperty( "java.io.tmpdir" ) ) );
+    public COSDocument() throws IOException {
+        this((File) null);
     }
 
     /**
@@ -99,10 +138,8 @@ public class COSDocument extends COSBase
      *
      *  @throws IOException If there is an error creating the tmp file.
      */
-    public COSDocument( File scratchDir ) throws IOException
-    {
-        tmpFile = File.createTempFile( "pdfbox", "tmp", scratchDir );
-        scratchFile = new RandomAccessFile( tmpFile, "rw" );
+    public COSDocument(File scratchDir) throws IOException {
+        this(scratchDir, false);
     }
 
     /**
@@ -113,9 +150,8 @@ public class COSDocument extends COSBase
      *
      * @param file The random access file to use for storage.
      */
-    public COSDocument( RandomAccess file )
-    {
-        scratchFile = file;
+    public COSDocument(RandomAccess file) {
+        this(file, false);
     }
 
     /**
@@ -379,15 +415,12 @@ public class COSDocument extends COSBase
      */
     public void close() throws IOException
     {
-        if( scratchFile != null )
-        {
+        if (!closed) {
             scratchFile.close();
-            scratchFile = null;
-        }
-        if( tmpFile != null )
-        {
-            tmpFile.delete();
-            tmpFile = null;
+            if (tmpFile != null) {
+                tmpFile.delete();
+            }
+            closed = true;
         }
     }
 
@@ -399,12 +432,12 @@ public class COSDocument extends COSBase
      */
     protected void finalize() throws IOException
     {
-        if( this.warnMissingClose && ( tmpFile != null || scratchFile != null ) )
-        {
-            Throwable t = new Throwable( "Warning: You did not close the PDF Document" );
-            t.printStackTrace();
+        if (!closed) {
+            if (warnMissingClose) {
+                log.warn( "Warning: You did not close a PDF Document" );
+            }
+            close();
         }
-        close();
     }
 
     /**
@@ -445,7 +478,8 @@ public class COSDocument extends COSBase
         for( COSObject objStream : getObjectsByType( "ObjStm" ) )
         {
             COSStream stream = (COSStream)objStream.getObject();
-            PDFObjectStreamParser parser = new PDFObjectStreamParser( stream, this );
+            PDFObjectStreamParser parser =
+                new PDFObjectStreamParser(stream, this, forceParsing);
             parser.parse();
             for( COSObject next : parser.getObjects() )
             {
@@ -524,7 +558,8 @@ public class COSDocument extends COSBase
         {
             COSStream stream = (COSStream)xrefStream.getObject();
             trailerDict.addAll(stream);
-            PDFXrefStreamParser parser = new PDFXrefStreamParser(stream, this);
+            PDFXrefStreamParser parser =
+                new PDFXrefStreamParser(stream, this, forceParsing);
             parser.parse();
         }
         setTrailer( trailerDict );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java Thu Oct 14 09:05:39 2010
@@ -172,8 +172,22 @@ public class COSString extends COSBase
      * @return A cos string with the hex characters converted to their actual bytes.
      * @throws IOException If there is an error with the hex string.
      */
-    public static COSString createFromHexString( String hex ) throws IOException
-    {
+    public static COSString createFromHexString(String hex)
+            throws IOException {
+        return createFromHexString(hex, false);
+    }
+
+    /**
+     * Creates a COS string from a string of hex characters, optionally
+     * ignoring malformed input.
+     *
+     * @param hex A hex string.
+     * @param force flag to ignore malformed input
+     * @return A cos string with the hex characters converted to their actual bytes.
+     * @throws IOException If there is an error with the hex string.
+     */
+    public static COSString createFromHexString(String hex, boolean force)
+            throws IOException {
         COSString retval = new COSString();
         StringBuilder hexBuffer = new StringBuilder( hex.trim() );
         //if odd number then the last hex digit is assumed to be 0
@@ -182,16 +196,19 @@ public class COSString extends COSBase
             hexBuffer.append( '0' );
         }
         int length = hexBuffer.length(); 
-        for( int i=0; i<length;)
-        {
-            String hexChars = String.valueOf(hexBuffer.charAt( i++ )) + hexBuffer.charAt( i++ );
-            try
-            {
-                retval.append( Integer.parseInt( hexChars, 16 ) );
-            }
-            catch( NumberFormatException e )
-            {
-                throw new IOException( "Error: Expected hex number, actual='" + hexChars + "'" );
+        for (int i = 0; i < length; i += 2) {
+            try {
+                retval.append(
+                        Integer.parseInt(hexBuffer.substring(i, i + 2), 16));
+            } catch (NumberFormatException e) {
+                if (force) {
+                    retval.append('?');
+                } else {
+                    IOException exception =
+                        new IOException("Invalid hex string: " + hex);
+                    exception.initCause(e);
+                    throw exception;
+                }
             }
         }
         return retval;

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu Oct 14 09:05:39 2010
@@ -17,13 +17,13 @@
 package org.apache.pdfbox.pdfparser;
 
 import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.pdfbox.io.ByteArrayPushBackInputStream;
 import org.apache.pdfbox.io.PushBackInputStream;
 import org.apache.pdfbox.io.RandomAccess;
 
@@ -90,6 +90,12 @@ public abstract class BaseParser
     public static final String DEF = "def";
 
     /**
+     * Default value of the {@link #forceParsing} flag.
+     */
+    protected static final boolean FORCE_PARSING =
+        Boolean.getBoolean("org.apache.pdfbox.forceParsing");
+
+    /**
      * This is the stream that will be read from.
      */
     protected PushBackInputStream pdfSource;
@@ -98,30 +104,46 @@ public abstract class BaseParser
      * This is the document that will be parsed.
      */
     protected COSDocument document;
-    
+
+    /**
+     * Flag to skip malformed or otherwise unparseable input where possible.
+     */
+    protected final boolean forceParsing;
+
     /**
      * Constructor.
      *
+     * @since Apache PDFBox 1.3.0
      * @param input The input stream to read the data from.
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     input where possible
+     * @throws IOException If there is an error reading the input stream.
+     */
+    public BaseParser(InputStream input, boolean forceParsing)
+            throws IOException {
+        this.pdfSource = new PushBackInputStream(
+                new BufferedInputStream(input, 16384),  4096);
+        this.forceParsing = forceParsing;
+    }
+
+    /**
+     * Constructor.
      *
+     * @param input The input stream to read the data from.
      * @throws IOException If there is an error reading the input stream.
      */
-    public BaseParser( InputStream input) throws IOException
-    {
-        //pdfSource = new PushBackByteArrayStream( input );
-        pdfSource = new PushBackInputStream( new BufferedInputStream( input, 16384 ), 4096 );
+    public BaseParser(InputStream input) throws IOException {
+        this(input, FORCE_PARSING);
     }
 
     /**
      * Constructor.
      *
      * @param input The array to read the data from.
-     *
      * @throws IOException If there is an error reading the byte data.
      */
-    protected BaseParser(byte[] input) throws IOException
-    {
-        pdfSource = new ByteArrayPushBackInputStream(input);
+    protected BaseParser(byte[] input) throws IOException {
+        this(new ByteArrayInputStream(input));
     }
 
     /**
@@ -747,7 +769,8 @@ public abstract class BaseParser
         }
         if( openBrace == '<' )
         {
-            retval = COSString.createFromHexString( retval.getString() );
+            retval = COSString.createFromHexString(
+                    retval.getString(), forceParsing);
         }
         return retval;
     }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Thu Oct 14 09:05:39 2010
@@ -50,19 +50,35 @@ public class PDFObjectStreamParser exten
     /**
      * Constructor.
      *
+     * @since Apache PDFBox 1.3.0
      * @param strm The stream to parse.
      * @param doc The document for the current parsing.
-     *
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     input where possible
      * @throws IOException If there is an error initializing the stream.
      */
-    public PDFObjectStreamParser( COSStream strm, COSDocument doc ) throws IOException
-    {
-       super( strm.getUnfilteredStream() );
+    public PDFObjectStreamParser(
+            COSStream strm, COSDocument doc, boolean forceParsing)
+            throws IOException {
+       super(strm.getUnfilteredStream(), forceParsing);
        setDocument( doc );
        stream = strm;
     }
 
     /**
+     * Constructor.
+     *
+     * @param strm The stream to parse.
+     * @param doc The document for the current parsing.
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFObjectStreamParser(COSStream strm, COSDocument doc)
+            throws IOException {
+        this(strm, doc, FORCE_PARSING);
+    }
+
+    /**
      * This will parse the tokens in the stream.  This will close the
      * stream when it is finished parsing.
      *

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Thu Oct 14 09:05:39 2010
@@ -59,8 +59,6 @@ public class PDFParser extends BaseParse
 
     private static final String PDF_HEADER = "%PDF-";
     private static final String FDF_HEADER = "%FDF-";
-    private boolean forceParsing = false; 
-    
     /**
      * A list of duplicate objects found when Parsing the PDF
      * File. 
@@ -81,9 +79,8 @@ public class PDFParser extends BaseParse
      *
      * @throws IOException If there is an error initializing the stream.
      */
-    public PDFParser( InputStream input ) throws IOException
-    {
-        this(input, null);
+    public PDFParser( InputStream input ) throws IOException {
+        this(input, null, FORCE_PARSING);
     }
 
     /**
@@ -94,10 +91,8 @@ public class PDFParser extends BaseParse
      * @throws IOException If there is an error initializing the stream.
      */
     public PDFParser(InputStream input, RandomAccess rafi)
-        throws IOException
-    {
-        super(input);
-        this.raf = rafi;
+        throws IOException {
+        this(input, rafi, FORCE_PARSING);
     }
     
     /**
@@ -111,11 +106,9 @@ public class PDFParser extends BaseParse
      * @throws IOException If there is an error initializing the stream.
      */
     public PDFParser(InputStream input, RandomAccess rafi, boolean force)
-        throws IOException
-    {
-        super(input);
+        throws IOException {
+        super(input, force);
         this.raf = rafi;
-        this.forceParsing = force;
     }
 
     /**

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Thu Oct 14 09:05:39 2010
@@ -27,7 +27,6 @@ import java.util.NoSuchElementException;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSBoolean;
 import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSInteger;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNull;
 import org.apache.pdfbox.cos.COSNumber;
@@ -53,18 +52,34 @@ public class PDFStreamParser extends Bas
     /**
      * Constructor that takes a stream to parse.
      *
+     * @since Apache PDFBox 1.3.0
      * @param stream The stream to read data from.
      * @param raf The random access file.
-     *
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     input where possible
      * @throws IOException If there is an error reading from the stream.
      */
-    public PDFStreamParser( InputStream stream, RandomAccess raf ) throws IOException
-    {
-        super( stream );
+    public PDFStreamParser(
+            InputStream stream, RandomAccess raf, boolean forceParsing)
+            throws IOException {
+        super(stream, forceParsing);
         file = raf;
     }
 
     /**
+     * Constructor that takes a stream to parse.
+     *
+     * @param stream The stream to read data from.
+     * @param raf The random access file.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public PDFStreamParser(InputStream stream, RandomAccess raf)
+            throws IOException {
+        this(stream, raf, FORCE_PARSING);
+    }
+
+    /**
      * Constructor.
      *
      * @param stream The stream to parse.
@@ -79,6 +94,20 @@ public class PDFStreamParser extends Bas
     /**
      * Constructor.
      *
+     * @since Apache PDFBox 1.3.0
+     * @param stream The stream to parse.
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     input where possible
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFStreamParser(COSStream stream, boolean forceParsing)
+            throws IOException {
+       this(stream.getUnfilteredStream(), stream.getScratchFile(), forceParsing);
+    }
+
+    /**
+     * Constructor.
+     *
      * @param stream The stream to parse.
      *
      * @throws IOException If there is an error initializing the stream.

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java Thu Oct 14 09:05:39 2010
@@ -42,19 +42,35 @@ public class PDFXrefStreamParser extends
     /**
      * Constructor.
      *
+     * @since Apache PDFBox 1.3.0
      * @param strm The stream to parse.
      * @param doc The document for the current parsing.
-     *
+     * @param forceParsing flag to skip malformed or otherwise unparseable
+     *                     input where possible
      * @throws IOException If there is an error initializing the stream.
      */
-    public PDFXrefStreamParser(COSStream strm, COSDocument doc) throws IOException
-    {
-        super(strm.getUnfilteredStream());
+    public PDFXrefStreamParser(
+            COSStream strm, COSDocument doc, boolean forceParsing)
+            throws IOException {
+        super(strm.getUnfilteredStream(), forceParsing);
         setDocument(doc);
         stream = strm;
     }
 
     /**
+     * Constructor.
+     *
+     * @param strm The stream to parse.
+     * @param doc The document for the current parsing.
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFXrefStreamParser(COSStream strm, COSDocument doc)
+            throws IOException {
+        this(strm, doc, false);
+    }
+
+    /**
      * Parses through the unfiltered stream and populates the xrefTable HashMap.
      * @throws IOException If there is an error while parsing the stream.
      */

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Thu Oct 14 09:05:39 2010
@@ -37,6 +37,7 @@ import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.exceptions.WrappedIOException;
 
+import org.apache.pdfbox.pdfparser.BaseParser;
 import org.apache.pdfbox.pdfparser.PDFStreamParser;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
@@ -89,7 +90,12 @@ public class PDFStreamEngine
     
     private int validCharCnt;
     private int totalCharCnt;
-    
+
+    /**
+     * Flag to skip malformed or otherwise unparseable input where possible.
+     */
+    private boolean forceParsing = false;
+
     /**
      * This is a simple internal class used by the Stream engine to handle the
      * resources stack.
@@ -163,7 +169,14 @@ public class PDFStreamEngine
         totalCharCnt = 0;
     }
 
-    
+    public boolean isForceParsing() {
+        return forceParsing;
+    }
+
+    public void setForceParsing(boolean forceParsing) {
+        this.forceParsing = forceParsing;
+    }
+
     /**
      * Register a custom operator processor with the engine.
      *
@@ -220,60 +233,52 @@ public class PDFStreamEngine
      *
      * @throws IOException If there is an exception while processing the stream.
      */
-    public void processSubStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
-    {
+    public void processSubStream(
+            PDPage aPage, PDResources resources, COSStream cosStream)
+            throws IOException {
         page = aPage;
-        PDFStreamParser parser = null;
-        if( resources != null )
-        {
+        if (resources != null) {
             StreamResources sr = new StreamResources();
             sr.fonts = resources.getFonts( documentFontCache );
             sr.colorSpaces = resources.getColorSpaces();
             sr.xobjects = resources.getXObjects();
             sr.graphicsStates = resources.getGraphicsStates();
             sr.resources = resources;
+
             streamResourcesStack.push(sr);
+            try {
+                processSubStream(cosStream);
+            } finally {
+                streamResourcesStack.pop();
+            }
+        } else {
+            processSubStream(cosStream);
         }
-        try
-        {
-            List<COSBase> arguments = new ArrayList<COSBase>();
-            
-            parser = new PDFStreamParser( cosStream );
+    }
+
+    private void processSubStream(COSStream cosStream) throws IOException {
+        List<COSBase> arguments = new ArrayList<COSBase>();
+        PDFStreamParser parser = new PDFStreamParser(cosStream, forceParsing);
+        try {
             Iterator<Object> iter = parser.getTokenIterator();
 
-            while( iter.hasNext() )
-            {
+            while (iter.hasNext()) {
                 Object next = iter.next();
-                if( next instanceof COSObject )
-                {
-                    arguments.add( ((COSObject)next).getObject() );
+                if (log.isDebugEnabled()) {
+                    log.debug("processing substream token: " + next);
                 }
-                else if( next instanceof PDFOperator )
-                {
-                    processOperator( (PDFOperator)next, arguments );
-                    arguments = new ArrayList();
-                }
-                else
-                {
-                    arguments.add( (COSBase)next );
+                if (next instanceof COSObject) {
+                    arguments.add(((COSObject) next).getObject());
+                } else if (next instanceof PDFOperator) {
+                    processOperator((PDFOperator) next, arguments);
+                    arguments = new ArrayList<COSBase>();
+                } else {
+                    arguments.add((COSBase) next);
                 }
-                if(log.isDebugEnabled())
-                {
-                    log.debug("token: " + next);
-                }
-            }
-        }
-        finally
-        {
-            if (parser != null) {
-                parser.close();
-            }
-            if( resources != null )
-            {
-                streamResourcesStack.pop();
             }
+        } finally {
+            parser.close();
         }
-
     }
 
     

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java Thu Oct 14 09:05:39 2010
@@ -115,7 +115,7 @@ public class TestPDFParser extends TestC
          */
         public TestParser( InputStream input) throws IOException
         {
-            super( input );
+            super( input, false );
         }
 
         /**

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Thu Oct 14 09:05:39 2010
@@ -230,20 +230,18 @@ public class TestTextStripper extends Te
             log.info("Preparing to parse " + inFile.getName() + " for standard test");
         }
 
-        OutputStream os = null;
-        Writer writer = null;
-        PDDocument document = null;
-        try
+        if (!outDir.exists()) 
         {
-            if (!outDir.exists()) 
+            if (!outDir.mkdirs()) 
             {
-                if (!outDir.mkdirs()) 
-                {
-                    throw (new Exception("Error creating " + outDir.getAbsolutePath() + " directory"));
-                }
+                throw (new Exception("Error creating " + outDir.getAbsolutePath() + " directory"));
             }
+        }
+
+        PDDocument document = PDDocument.load(inFile);
+        try
+        {
             
-            document = PDDocument.load(inFile);
             File outFile = null;
             File expectedFile = null;
 
@@ -258,20 +256,23 @@ public class TestTextStripper extends Te
                 expectedFile = new File(inFile.getParentFile(), inFile.getName() + ".txt");
             }
 
-            os = new FileOutputStream(outFile);
-            os.write( 0xFF );
-            os.write( 0xFE );
-            writer = new OutputStreamWriter(os,encoding);
-
-            //Allows for sorted tests 
-            stripper.setSortByPosition(bSort);
-            stripper.writeText(document, writer);
-            // close the written file before reading it again
-            writer.close();
-            writer = null;
-            os.close();
-            os = null;
-            
+            OutputStream os = new FileOutputStream(outFile);
+            try {
+                os.write( 0xFF );
+                os.write( 0xFE );
+
+                Writer writer = new OutputStreamWriter(os, encoding);
+                try {
+                    //Allows for sorted tests 
+                    stripper.setSortByPosition(bSort);
+                    stripper.writeText(document, writer);
+                } finally {
+                    // close the written file before reading it again
+                    writer.close();
+                }
+            } finally {
+                os.close();
+            }
 
             if (bLogResult)
             {
@@ -327,10 +328,7 @@ public class TestTextStripper extends Te
         }
         finally
         {
-            if( document != null )
-            {
-                document.close();
-            }
+            document.close();
         }
     }