You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/04/05 18:11:19 UTC
svn commit: r1671405 - in
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser:
BaseParser.java COSParser.java
Author: tilman
Date: Sun Apr 5 16:11:18 2015
New Revision: 1671405
URL: http://svn.apache.org/r1671405
Log:
PDFBOX-2576: refactor double code in parseCOSStream
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1671405&r1=1671404&r2=1671405&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Apr 5 16:11:18 2015
@@ -375,40 +375,7 @@ public abstract class BaseParser impleme
{
readExpectedString(STREAM_STRING);
- //PDF Ref 3.2.7 A stream must be followed by either
- //a CRLF or LF but nothing else.
-
- int whitespace = pdfSource.read();
-
- //see brother_scan_cover.pdf, it adds whitespaces
- //after the stream but before the start of the
- //data, so just read those first
- while (ASCII_SPACE == whitespace)
- {
- whitespace = pdfSource.read();
- }
-
- if( ASCII_CR == whitespace )
- {
- whitespace = pdfSource.read();
- if( ASCII_LF != whitespace )
- {
- pdfSource.unread( whitespace );
- //The spec says this is invalid but it happens in the real
- //world so we must support it.
- }
- }
- else if (ASCII_LF == whitespace)
- {
- //that is fine
- }
- else
- {
- //we are in an error.
- //but again we will do a lenient parsing and just assume that everything
- //is fine
- pdfSource.unread( whitespace );
- }
+ skipWhiteSpaces();
// This needs to be dic.getItem because when we are parsing, the underlying object
// might still be null.
@@ -557,6 +524,40 @@ public abstract class BaseParser impleme
return stream;
}
+ protected void skipWhiteSpaces() throws IOException
+ {
+ //PDF Ref 3.2.7 A stream must be followed by either
+ //a CRLF or LF but nothing else.
+
+ int whitespace = pdfSource.read();
+
+ //see brother_scan_cover.pdf, it adds whitespaces
+ //after the stream but before the start of the
+ //data, so just read those first
+ while (ASCII_SPACE == whitespace)
+ {
+ whitespace = pdfSource.read();
+ }
+
+ if (ASCII_CR == whitespace)
+ {
+ whitespace = pdfSource.read();
+ if (ASCII_LF != whitespace)
+ {
+ pdfSource.unread(whitespace);
+ //The spec says this is invalid but it happens in the real
+ //world so we must support it.
+ }
+ }
+ else if (ASCII_LF != whitespace)
+ {
+ //we are in an error.
+ //but again we will do a lenient parsing and just assume that everything
+ //is fine
+ pdfSource.unread(whitespace);
+ }
+ }
+
/**
* This method will read through the current stream object until
* we find the keyword "endstream" meaning we're at the end of this
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1671405&r1=1671404&r2=1671405&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Apr 5 16:11:18 2015
@@ -891,36 +891,8 @@ public class COSParser extends BaseParse
{
// read 'stream'; this was already tested in parseObjectsDynamically()
readString();
- // skip whitespaces before start of data
- // PDF Ref 1.7, chap. 3.2.7:
- // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF
- // but nothing else.
- int whitespace = pdfSource.read();
- // see brother_scan_cover.pdf, it adds whitespaces
- // after the stream but before the start of the
- // data, so just read those first
- while (whitespace == 0x20)
- {
- whitespace = pdfSource.read();
- }
-
- if (whitespace == 0x0D)
- {
- whitespace = pdfSource.read();
- if (whitespace != 0x0A)
- {
- // the spec says this is invalid but it happens in the
- // real world so we must support it
- pdfSource.unread(whitespace);
- }
- }
- else if (whitespace != 0x0A)
- {
- // no whitespace after 'stream'; PDF ref. says 'should' so
- // that is ok
- pdfSource.unread(whitespace);
- }
+ skipWhiteSpaces();
/*
* This needs to be dic.getItem because when we are parsing, the underlying object might still be null.