You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2011/12/01 16:56:44 UTC
svn commit: r1209127 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Author: lehmi
Date: Thu Dec 1 15:56:44 2011
New Revision: 1209127
URL: http://svn.apache.org/viewvc?rev=1209127&view=rev
Log:
PDFBOX-1171: added improved hexstring handling as proposed by Timo Boehme, removed hexstring stuff from parseCOSString
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1209127&r1=1209126&r2=1209127&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu Dec 1 15:56:44 2011
@@ -692,8 +692,7 @@ public abstract class BaseParser
}
else if( nextChar == '<' )
{
- openBrace = '<';
- closeBrace = '>';
+ return parseCOSHexString();
}
else
{
@@ -731,115 +730,105 @@ public abstract class BaseParser
char next = (char)pdfSource.read();
switch(next)
{
- case 'n':
- retval.append( '\n' );
- break;
- case 'r':
- retval.append( '\r' );
- break;
- case 't':
- retval.append( '\t' );
- break;
- case 'b':
- retval.append( '\b' );
- break;
- case 'f':
- retval.append( '\f' );
- break;
- case ')':
- // PDFBox 276 /Title (c:\)
- braces = checkForMissingCloseParen(braces);
- if( braces != 0 )
- {
+ case 'n':
+ retval.append( '\n' );
+ break;
+ case 'r':
+ retval.append( '\r' );
+ break;
+ case 't':
+ retval.append( '\t' );
+ break;
+ case 'b':
+ retval.append( '\b' );
+ break;
+ case 'f':
+ retval.append( '\f' );
+ break;
+ case ')':
+ // PDFBox 276 /Title (c:\)
+ braces = checkForMissingCloseParen(braces);
+ if( braces != 0 )
+ {
+ retval.append( next );
+ }
+ else
+ {
+ retval.append('\\');
+ }
+ break;
+ case '(':
+ case '\\':
retval.append( next );
- }
- else
- {
- retval.append('\\');
- }
- break;
- case '(':
- case '\\':
- retval.append( next );
- break;
- case 10:
- case 13:
- //this is a break in the line so ignore it and the newline and continue
- c = pdfSource.read();
- while( isEOL(c) && c != -1)
- {
+ break;
+ case 10:
+ case 13:
+ //this is a break in the line so ignore it and the newline and continue
c = pdfSource.read();
- }
- nextc = c;
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- {
- StringBuffer octal = new StringBuffer();
- octal.append( next );
- c = pdfSource.read();
- char digit = (char)c;
- if( digit >= '0' && digit <= '7' )
+ while( isEOL(c) && c != -1)
+ {
+ c = pdfSource.read();
+ }
+ nextc = c;
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
{
- octal.append( digit );
+ StringBuffer octal = new StringBuffer();
+ octal.append( next );
c = pdfSource.read();
- digit = (char)c;
+ char digit = (char)c;
if( digit >= '0' && digit <= '7' )
{
octal.append( digit );
+ c = pdfSource.read();
+ digit = (char)c;
+ if( digit >= '0' && digit <= '7' )
+ {
+ octal.append( digit );
+ }
+ else
+ {
+ nextc = c;
+ }
}
else
{
nextc = c;
}
+
+ int character = 0;
+ try
+ {
+ character = Integer.parseInt( octal.toString(), 8 );
+ }
+ catch( NumberFormatException e )
+ {
+ throw new IOException( "Error: Expected octal character, actual='" + octal + "'" );
+ }
+ retval.append( character );
+ break;
}
- else
- {
- nextc = c;
- }
-
- int character = 0;
- try
- {
- character = Integer.parseInt( octal.toString(), 8 );
- }
- catch( NumberFormatException e )
+ default:
{
- throw new IOException( "Error: Expected octal character, actual='" + octal + "'" );
+ retval.append( '\\' );
+ retval.append( next );
+ //another problem with PDFs: sometimes the \ doesn't really
+ //mean escape like the PDF spec says it does; sometimes it should be literal,
+ //which is what we will assume here.
+ //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
}
- retval.append( character );
- break;
- }
- default:
- {
- retval.append( '\\' );
- retval.append( next );
- //another ficken problem with PDF's, sometimes the \ doesn't really
- //mean escape like the PDF spec says it does, sometimes is should be literal
- //which is what we will assume here.
- //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
- }
}
}
else
{
- if( openBrace == '<' )
- {
- if( isHexDigit(ch) )
- {
- retval.append( ch );
- }
- }
- else
- {
- retval.append( ch );
- }
+ retval.append( ch );
}
if (nextc != -2)
{
@@ -854,15 +843,58 @@ public abstract class BaseParser
{
pdfSource.unread(c);
}
- if( openBrace == '<' )
- {
- retval = COSString.createFromHexString(
- retval.getString(), forceParsing);
- }
return retval;
}
/**
+ * Parses a PDF hex string with fail-fast semantics: parsing stops
+ * with an exception as soon as a disallowed character is found.
+ * This is necessary in order to detect malformed input and
+ * to be able to skip to the start of the next object.
+ *
+ * The opening '<' is assumed to have been read already.
+ *
+ * @return The parsed PDF string.
+ *
+ * @throws IOException If reading fails, the stream ends before '>', or a disallowed character is found.
+ */
+ private final COSString parseCOSHexString() throws IOException
+ {
+ // collects only the hex digits; the whitespace skipped below is never stored
+ final StringBuilder sBuf = new StringBuilder();
+
+ while( true )
+ {
+ int c = pdfSource.read();
+ // NOTE(review): presumably isHexDigit rejects (char)-1, so EOS reaches the c < 0 branch - confirm
+ if ( isHexDigit((char)c) )
+ {
+ sBuf.append( (char) c );
+ }
+ else if ( c == '>' )
+ {
+ break;
+ }
+ else if ( c < 0 )
+ {
+ throw new IOException( "Missing closing bracket for hex string. Reached EOS." );
+ }
+ else if ( ( c == ' ' ) || ( c == '\n' ) ||
+ ( c == '\t' ) || ( c == '\r' ) ||
+ ( c == '\b' ) || ( c == '\f' ) )
+ {
+ continue;
+ }
+ else
+ {
+ // character is neither a hex digit, the closing '>', EOS, nor whitespace
+ throw new IOException( "Not allowed character in hex string; char code: " + c );
+ }
+ }
+ return COSString.createFromHexString( sBuf.toString(), forceParsing );
+ }
+
+ /**
* This will parse a PDF array object.
*
* @return The parsed PDF array.