You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2010/10/14 11:05:40 UTC
svn commit: r1022431 - in /pdfbox/trunk/pdfbox/src:
main/java/org/apache/pdfbox/ main/java/org/apache/pdfbox/cos/
main/java/org/apache/pdfbox/pdfparser/ main/java/org/apache/pdfbox/util/
test/java/org/apache/pdfbox/pdfparser/ test/java/org/apache/pdfbox/util/
Author: jukka
Date: Thu Oct 14 09:05:39 2010
New Revision: 1022431
URL: http://svn.apache.org/viewvc?rev=1022431&view=rev
Log:
PDFBOX-789: Error by text extraction
Extend the forceParsing flag to cover hex strings and all kinds of stream parsers.
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java Thu Oct 14 09:05:39 2010
@@ -229,6 +229,7 @@ public class ExtractText
{
stripper = new PDFTextStripper(encoding);
}
+ stripper.setForceParsing( force );
stripper.setSortByPosition( sort );
stripper.setShouldSeparateByBeads( separateBeads );
stripper.setStartPage( startPage );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java Thu Oct 14 09:05:39 2010
@@ -72,23 +72,62 @@ public class COSDocument extends COSBase
/**
* This file will store the streams in order to conserve memory.
*/
- private RandomAccess scratchFile = null;
+ private final RandomAccess scratchFile;
- private File tmpFile = null;
+ private final File tmpFile;
private String headerString = "%PDF-1.4";
private boolean warnMissingClose = true;
+ private boolean closed = false;
+
+ /**
+ * Flag to skip malformed or otherwise unparseable input where possible.
+ */
+ private final boolean forceParsing;
+
+ /**
+ * Constructor that will use the given random access file for storage
+ * of the PDF streams. The client of this method is responsible for
+ * deleting the storage that this file will write to, if necessary. The
+ * close method will close the file though.
+ *
+ * @param scratchFile the random access file to use for storage
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * document content where possible
+ */
+ public COSDocument(RandomAccess scratchFile, boolean forceParsing) {
+ this.scratchFile = scratchFile;
+ this.tmpFile = null;
+ this.forceParsing = forceParsing;
+ }
+
+ /**
+ * Constructor that will use a temporary file in the given directory
+ * for storage of the PDF streams. The temporary file is automatically
+ * removed when this document gets closed.
+ *
+ * @param scratchDir directory for the temporary file,
+ * or <code>null</code> to use the system default
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * document content where possible
+ */
+ public COSDocument(File scratchDir, boolean forceParsing)
+ throws IOException {
+ this.tmpFile = File.createTempFile("pdfbox-", ".tmp", scratchDir);
+ this.scratchFile = new RandomAccessFile(tmpFile, "rw");
+ this.forceParsing = forceParsing;
+ }
+
/**
* Constructor. Uses the java.io.tmpdir value to create a file
* to store the streams.
*
* @throws IOException If there is an error creating the tmp file.
*/
- public COSDocument() throws IOException
- {
- this( new File( System.getProperty( "java.io.tmpdir" ) ) );
+ public COSDocument() throws IOException {
+ this((File) null);
}
/**
@@ -99,10 +138,8 @@ public class COSDocument extends COSBase
*
* @throws IOException If there is an error creating the tmp file.
*/
- public COSDocument( File scratchDir ) throws IOException
- {
- tmpFile = File.createTempFile( "pdfbox", "tmp", scratchDir );
- scratchFile = new RandomAccessFile( tmpFile, "rw" );
+ public COSDocument(File scratchDir) throws IOException {
+ this(scratchDir, false);
}
/**
@@ -113,9 +150,8 @@ public class COSDocument extends COSBase
*
* @param file The random access file to use for storage.
*/
- public COSDocument( RandomAccess file )
- {
- scratchFile = file;
+ public COSDocument(RandomAccess file) {
+ this(file, false);
}
/**
@@ -379,15 +415,12 @@ public class COSDocument extends COSBase
*/
public void close() throws IOException
{
- if( scratchFile != null )
- {
+ if (!closed) {
scratchFile.close();
- scratchFile = null;
- }
- if( tmpFile != null )
- {
- tmpFile.delete();
- tmpFile = null;
+ if (tmpFile != null) {
+ tmpFile.delete();
+ }
+ closed = true;
}
}
@@ -399,12 +432,12 @@ public class COSDocument extends COSBase
*/
protected void finalize() throws IOException
{
- if( this.warnMissingClose && ( tmpFile != null || scratchFile != null ) )
- {
- Throwable t = new Throwable( "Warning: You did not close the PDF Document" );
- t.printStackTrace();
+ if (!closed) {
+ if (warnMissingClose) {
+ log.warn( "Warning: You did not close a PDF Document" );
+ }
+ close();
}
- close();
}
/**
@@ -445,7 +478,8 @@ public class COSDocument extends COSBase
for( COSObject objStream : getObjectsByType( "ObjStm" ) )
{
COSStream stream = (COSStream)objStream.getObject();
- PDFObjectStreamParser parser = new PDFObjectStreamParser( stream, this );
+ PDFObjectStreamParser parser =
+ new PDFObjectStreamParser(stream, this, forceParsing);
parser.parse();
for( COSObject next : parser.getObjects() )
{
@@ -524,7 +558,8 @@ public class COSDocument extends COSBase
{
COSStream stream = (COSStream)xrefStream.getObject();
trailerDict.addAll(stream);
- PDFXrefStreamParser parser = new PDFXrefStreamParser(stream, this);
+ PDFXrefStreamParser parser =
+ new PDFXrefStreamParser(stream, this, forceParsing);
parser.parse();
}
setTrailer( trailerDict );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java Thu Oct 14 09:05:39 2010
@@ -172,8 +172,22 @@ public class COSString extends COSBase
* @return A cos string with the hex characters converted to their actual bytes.
* @throws IOException If there is an error with the hex string.
*/
- public static COSString createFromHexString( String hex ) throws IOException
- {
+ public static COSString createFromHexString(String hex)
+ throws IOException {
+ return createFromHexString(hex, false);
+ }
+
+ /**
+ * Creates a COS string from a string of hex characters, optionally
+ * ignoring malformed input.
+ *
+ * @param hex A hex string.
+ * @param force flag to ignore malformed input
+ * @return A cos string with the hex characters converted to their actual bytes.
+ * @throws IOException If there is an error with the hex string.
+ */
+ public static COSString createFromHexString(String hex, boolean force)
+ throws IOException {
COSString retval = new COSString();
StringBuilder hexBuffer = new StringBuilder( hex.trim() );
//if odd number then the last hex digit is assumed to be 0
@@ -182,16 +196,19 @@ public class COSString extends COSBase
hexBuffer.append( '0' );
}
int length = hexBuffer.length();
- for( int i=0; i<length;)
- {
- String hexChars = String.valueOf(hexBuffer.charAt( i++ )) + hexBuffer.charAt( i++ );
- try
- {
- retval.append( Integer.parseInt( hexChars, 16 ) );
- }
- catch( NumberFormatException e )
- {
- throw new IOException( "Error: Expected hex number, actual='" + hexChars + "'" );
+ for (int i = 0; i < length; i += 2) {
+ try {
+ retval.append(
+ Integer.parseInt(hexBuffer.substring(i, i + 2), 16));
+ } catch (NumberFormatException e) {
+ if (force) {
+ retval.append('?');
+ } else {
+ IOException exception =
+ new IOException("Invalid hex string: " + hex);
+ exception.initCause(e);
+ throw exception;
+ }
}
}
return retval;
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu Oct 14 09:05:39 2010
@@ -17,13 +17,13 @@
package org.apache.pdfbox.pdfparser;
import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.pdfbox.io.ByteArrayPushBackInputStream;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
@@ -90,6 +90,12 @@ public abstract class BaseParser
public static final String DEF = "def";
/**
+ * Default value of the {@link #forceParsing} flag.
+ */
+ protected static final boolean FORCE_PARSING =
+ Boolean.getBoolean("org.apache.pdfbox.forceParsing");
+
+ /**
* This is the stream that will be read from.
*/
protected PushBackInputStream pdfSource;
@@ -98,30 +104,46 @@ public abstract class BaseParser
* This is the document that will be parsed.
*/
protected COSDocument document;
-
+
+ /**
+ * Flag to skip malformed or otherwise unparseable input where possible.
+ */
+ protected final boolean forceParsing;
+
/**
* Constructor.
*
+ * @since Apache PDFBox 1.3.0
* @param input The input stream to read the data from.
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * input where possible
+ * @throws IOException If there is an error reading the input stream.
+ */
+ public BaseParser(InputStream input, boolean forceParsing)
+ throws IOException {
+ this.pdfSource = new PushBackInputStream(
+ new BufferedInputStream(input, 16384), 4096);
+ this.forceParsing = forceParsing;
+ }
+
+ /**
+ * Constructor.
*
+ * @param input The input stream to read the data from.
* @throws IOException If there is an error reading the input stream.
*/
- public BaseParser( InputStream input) throws IOException
- {
- //pdfSource = new PushBackByteArrayStream( input );
- pdfSource = new PushBackInputStream( new BufferedInputStream( input, 16384 ), 4096 );
+ public BaseParser(InputStream input) throws IOException {
+ this(input, FORCE_PARSING);
}
/**
* Constructor.
*
* @param input The array to read the data from.
- *
* @throws IOException If there is an error reading the byte data.
*/
- protected BaseParser(byte[] input) throws IOException
- {
- pdfSource = new ByteArrayPushBackInputStream(input);
+ protected BaseParser(byte[] input) throws IOException {
+ this(new ByteArrayInputStream(input));
}
/**
@@ -747,7 +769,8 @@ public abstract class BaseParser
}
if( openBrace == '<' )
{
- retval = COSString.createFromHexString( retval.getString() );
+ retval = COSString.createFromHexString(
+ retval.getString(), forceParsing);
}
return retval;
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Thu Oct 14 09:05:39 2010
@@ -50,19 +50,35 @@ public class PDFObjectStreamParser exten
/**
* Constructor.
*
+ * @since Apache PDFBox 1.3.0
* @param strm The stream to parse.
* @param doc The document for the current parsing.
- *
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * input where possible
* @throws IOException If there is an error initializing the stream.
*/
- public PDFObjectStreamParser( COSStream strm, COSDocument doc ) throws IOException
- {
- super( strm.getUnfilteredStream() );
+ public PDFObjectStreamParser(
+ COSStream strm, COSDocument doc, boolean forceParsing)
+ throws IOException {
+ super(strm.getUnfilteredStream(), forceParsing);
setDocument( doc );
stream = strm;
}
/**
+ * Constructor.
+ *
+ * @param strm The stream to parse.
+ * @param doc The document for the current parsing.
+ *
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFObjectStreamParser(COSStream strm, COSDocument doc)
+ throws IOException {
+ this(strm, doc, FORCE_PARSING);
+ }
+
+ /**
* This will parse the tokens in the stream. This will close the
* stream when it is finished parsing.
*
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Thu Oct 14 09:05:39 2010
@@ -59,8 +59,6 @@ public class PDFParser extends BaseParse
private static final String PDF_HEADER = "%PDF-";
private static final String FDF_HEADER = "%FDF-";
- private boolean forceParsing = false;
-
/**
* A list of duplicate objects found when Parsing the PDF
* File.
@@ -81,9 +79,8 @@ public class PDFParser extends BaseParse
*
* @throws IOException If there is an error initializing the stream.
*/
- public PDFParser( InputStream input ) throws IOException
- {
- this(input, null);
+ public PDFParser( InputStream input ) throws IOException {
+ this(input, null, FORCE_PARSING);
}
/**
@@ -94,10 +91,8 @@ public class PDFParser extends BaseParse
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser(InputStream input, RandomAccess rafi)
- throws IOException
- {
- super(input);
- this.raf = rafi;
+ throws IOException {
+ this(input, rafi, FORCE_PARSING);
}
/**
@@ -111,11 +106,9 @@ public class PDFParser extends BaseParse
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser(InputStream input, RandomAccess rafi, boolean force)
- throws IOException
- {
- super(input);
+ throws IOException {
+ super(input, force);
this.raf = rafi;
- this.forceParsing = force;
}
/**
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Thu Oct 14 09:05:39 2010
@@ -27,7 +27,6 @@ import java.util.NoSuchElementException;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
@@ -53,18 +52,34 @@ public class PDFStreamParser extends Bas
/**
* Constructor that takes a stream to parse.
*
+ * @since Apache PDFBox 1.3.0
* @param stream The stream to read data from.
* @param raf The random access file.
- *
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * input where possible
* @throws IOException If there is an error reading from the stream.
*/
- public PDFStreamParser( InputStream stream, RandomAccess raf ) throws IOException
- {
- super( stream );
+ public PDFStreamParser(
+ InputStream stream, RandomAccess raf, boolean forceParsing)
+ throws IOException {
+ super(stream, forceParsing);
file = raf;
}
/**
+ * Constructor that takes a stream to parse.
+ *
+ * @param stream The stream to read data from.
+ * @param raf The random access file.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ public PDFStreamParser(InputStream stream, RandomAccess raf)
+ throws IOException {
+ this(stream, raf, FORCE_PARSING);
+ }
+
+ /**
* Constructor.
*
* @param stream The stream to parse.
@@ -79,6 +94,20 @@ public class PDFStreamParser extends Bas
/**
* Constructor.
*
+ * @since Apache PDFBox 1.3.0
+ * @param stream The stream to parse.
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * input where possible
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFStreamParser(COSStream stream, boolean forceParsing)
+ throws IOException {
+ this(stream.getUnfilteredStream(), stream.getScratchFile(), forceParsing);
+ }
+
+ /**
+ * Constructor.
+ *
* @param stream The stream to parse.
*
* @throws IOException If there is an error initializing the stream.
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFXrefStreamParser.java Thu Oct 14 09:05:39 2010
@@ -42,19 +42,35 @@ public class PDFXrefStreamParser extends
/**
* Constructor.
*
+ * @since Apache PDFBox 1.3.0
* @param strm The stream to parse.
* @param doc The document for the current parsing.
- *
+ * @param forceParsing flag to skip malformed or otherwise unparseable
+ * input where possible
* @throws IOException If there is an error initializing the stream.
*/
- public PDFXrefStreamParser(COSStream strm, COSDocument doc) throws IOException
- {
- super(strm.getUnfilteredStream());
+ public PDFXrefStreamParser(
+ COSStream strm, COSDocument doc, boolean forceParsing)
+ throws IOException {
+ super(strm.getUnfilteredStream(), forceParsing);
setDocument(doc);
stream = strm;
}
/**
+ * Constructor.
+ *
+ * @param strm The stream to parse.
+ * @param doc The document for the current parsing.
+ *
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFXrefStreamParser(COSStream strm, COSDocument doc)
+ throws IOException {
+ this(strm, doc, false);
+ }
+
+ /**
* Parses through the unfiltered stream and populates the xrefTable HashMap.
* @throws IOException If there is an error while parsing the stream.
*/
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Thu Oct 14 09:05:39 2010
@@ -37,6 +37,7 @@ import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.exceptions.WrappedIOException;
+import org.apache.pdfbox.pdfparser.BaseParser;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
@@ -89,7 +90,12 @@ public class PDFStreamEngine
private int validCharCnt;
private int totalCharCnt;
-
+
+ /**
+ * Flag to skip malformed or otherwise unparseable input where possible.
+ */
+ private boolean forceParsing = false;
+
/**
* This is a simple internal class used by the Stream engine to handle the
* resources stack.
@@ -163,7 +169,14 @@ public class PDFStreamEngine
totalCharCnt = 0;
}
-
+ public boolean isForceParsing() {
+ return forceParsing;
+ }
+
+ public void setForceParsing(boolean forceParsing) {
+ this.forceParsing = forceParsing;
+ }
+
/**
* Register a custom operator processor with the engine.
*
@@ -220,60 +233,52 @@ public class PDFStreamEngine
*
* @throws IOException If there is an exception while processing the stream.
*/
- public void processSubStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
- {
+ public void processSubStream(
+ PDPage aPage, PDResources resources, COSStream cosStream)
+ throws IOException {
page = aPage;
- PDFStreamParser parser = null;
- if( resources != null )
- {
+ if (resources != null) {
StreamResources sr = new StreamResources();
sr.fonts = resources.getFonts( documentFontCache );
sr.colorSpaces = resources.getColorSpaces();
sr.xobjects = resources.getXObjects();
sr.graphicsStates = resources.getGraphicsStates();
sr.resources = resources;
+
streamResourcesStack.push(sr);
+ try {
+ processSubStream(cosStream);
+ } finally {
+ streamResourcesStack.pop();
+ }
+ } else {
+ processSubStream(cosStream);
}
- try
- {
- List<COSBase> arguments = new ArrayList<COSBase>();
-
- parser = new PDFStreamParser( cosStream );
+ }
+
+ private void processSubStream(COSStream cosStream) throws IOException {
+ List<COSBase> arguments = new ArrayList<COSBase>();
+ PDFStreamParser parser = new PDFStreamParser(cosStream, forceParsing);
+ try {
Iterator<Object> iter = parser.getTokenIterator();
- while( iter.hasNext() )
- {
+ while (iter.hasNext()) {
Object next = iter.next();
- if( next instanceof COSObject )
- {
- arguments.add( ((COSObject)next).getObject() );
+ if (log.isDebugEnabled()) {
+ log.debug("processing substream token: " + next);
}
- else if( next instanceof PDFOperator )
- {
- processOperator( (PDFOperator)next, arguments );
- arguments = new ArrayList();
- }
- else
- {
- arguments.add( (COSBase)next );
+ if (next instanceof COSObject) {
+ arguments.add(((COSObject) next).getObject());
+ } else if (next instanceof PDFOperator) {
+ processOperator((PDFOperator) next, arguments);
+ arguments = new ArrayList<COSBase>();
+ } else {
+ arguments.add((COSBase) next);
}
- if(log.isDebugEnabled())
- {
- log.debug("token: " + next);
- }
- }
- }
- finally
- {
- if (parser != null) {
- parser.close();
- }
- if( resources != null )
- {
- streamResourcesStack.pop();
}
+ } finally {
+ parser.close();
}
-
}
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java Thu Oct 14 09:05:39 2010
@@ -115,7 +115,7 @@ public class TestPDFParser extends TestC
*/
public TestParser( InputStream input) throws IOException
{
- super( input );
+ super( input, false );
}
/**
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1022431&r1=1022430&r2=1022431&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Thu Oct 14 09:05:39 2010
@@ -230,20 +230,18 @@ public class TestTextStripper extends Te
log.info("Preparing to parse " + inFile.getName() + " for standard test");
}
- OutputStream os = null;
- Writer writer = null;
- PDDocument document = null;
- try
+ if (!outDir.exists())
{
- if (!outDir.exists())
+ if (!outDir.mkdirs())
{
- if (!outDir.mkdirs())
- {
- throw (new Exception("Error creating " + outDir.getAbsolutePath() + " directory"));
- }
+ throw (new Exception("Error creating " + outDir.getAbsolutePath() + " directory"));
}
+ }
+
+ PDDocument document = PDDocument.load(inFile);
+ try
+ {
- document = PDDocument.load(inFile);
File outFile = null;
File expectedFile = null;
@@ -258,20 +256,23 @@ public class TestTextStripper extends Te
expectedFile = new File(inFile.getParentFile(), inFile.getName() + ".txt");
}
- os = new FileOutputStream(outFile);
- os.write( 0xFF );
- os.write( 0xFE );
- writer = new OutputStreamWriter(os,encoding);
-
- //Allows for sorted tests
- stripper.setSortByPosition(bSort);
- stripper.writeText(document, writer);
- // close the written file before reading it again
- writer.close();
- writer = null;
- os.close();
- os = null;
-
+ OutputStream os = new FileOutputStream(outFile);
+ try {
+ os.write( 0xFF );
+ os.write( 0xFE );
+
+ Writer writer = new OutputStreamWriter(os, encoding);
+ try {
+ //Allows for sorted tests
+ stripper.setSortByPosition(bSort);
+ stripper.writeText(document, writer);
+ } finally {
+ // close the written file before reading it again
+ writer.close();
+ }
+ } finally {
+ os.close();
+ }
if (bLogResult)
{
@@ -327,10 +328,7 @@ public class TestTextStripper extends Te
}
finally
{
- if( document != null )
- {
- document.close();
- }
+ document.close();
}
}