You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/05/17 19:36:24 UTC
svn commit: r1595520 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
Author: tilman
Date: Sat May 17 17:36:24 2014
New Revision: 1595520
URL: http://svn.apache.org/r1595520
Log:
PDFBOX-1922: read version header in non sequential parser
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1595520&r1=1595519&r2=1595520&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sat May 17 17:36:24 2014
@@ -81,6 +81,10 @@ import org.apache.pdfbox.persistence.uti
*/
public class NonSequentialPDFParser extends PDFParser
{
+ private static final String PDF_HEADER = "%PDF-"; // marker parseHeader() searches for in the header line of a PDF file
+ private static final String FDF_HEADER = "%FDF-"; // alternative marker parseHeader() accepts when no PDF marker is present
+ private static final String PDF_DEFAULT_VERSION = "1.4"; // version substituted when a PDF header carries no version number
+ private static final String FDF_DEFAULT_VERSION = "1.0"; // version substituted when an FDF header carries no version number
private static final byte[] XREF = new byte[] { 'x', 'r', 'e', 'f' };
@@ -733,6 +737,11 @@ public class NonSequentialPDFParser exte
try
{
+ // PDFBOX-1922 read the version header and rewind
+ // this part copied from the sequential parser
+ parseHeader();
+ pdfSource.seek(0);
+
if (!initialParseDone)
{
initialParse();
@@ -1866,5 +1875,109 @@ public class NonSequentialPDFParser exte
}
}
}
+
+ private void parseHeader() throws IOException
+ {
+ // read first line
+ String header = readLine();
+ // some pdf-documents are broken and the pdf-version is in one of the following lines
+ if ((header.indexOf(PDF_HEADER) == -1) && (header.indexOf(FDF_HEADER) == -1))
+ {
+ header = readLine();
+ while ((header.indexOf(PDF_HEADER) == -1) && (header.indexOf(FDF_HEADER) == -1))
+ {
+ // if a line starts with a digit, it has to be the first one with data in it
+ if ((header.length() > 0) && (Character.isDigit(header.charAt(0))))
+ {
+ break;
+ }
+ header = readLine();
+ }
+ }
+
+ // nothing found
+ if ((header.indexOf(PDF_HEADER) == -1) && (header.indexOf(FDF_HEADER) == -1))
+ {
+ throw new IOException("Error: Header doesn't contain versioninfo");
+ }
+
+ //sometimes there are some garbage bytes in the header before the header
+ //actually starts, so lets try to find the header first.
+ int headerStart = header.indexOf(PDF_HEADER);
+ if (headerStart == -1)
+ {
+ headerStart = header.indexOf(FDF_HEADER);
+ }
+
+ //greater than zero because if it is zero then
+ //there is no point of trimming
+ if (headerStart > 0)
+ {
+ //trim off any leading characters
+ header = header.substring(headerStart, header.length());
+ }
+
+ /*
+ * This is used if there is garbage after the header on the same line
+ */
+ if (header.startsWith(PDF_HEADER))
+ {
+ if (!header.matches(PDF_HEADER + "\\d.\\d"))
+ {
+
+ if (header.length() < PDF_HEADER.length() + 3)
+ {
+ // No version number at all, set to 1.4 as default
+ header = PDF_HEADER + PDF_DEFAULT_VERSION;
+ LOG.debug("No pdf version found, set to " + PDF_DEFAULT_VERSION + " as default.");
+ }
+ else
+ {
+ String headerGarbage = header.substring(PDF_HEADER.length() + 3, header.length()) + "\n";
+ header = header.substring(0, PDF_HEADER.length() + 3);
+ pdfSource.unread(headerGarbage.getBytes("ISO-8859-1"));
+ }
+ }
+ }
+ else
+ {
+ if (!header.matches(FDF_HEADER + "\\d.\\d"))
+ {
+ if (header.length() < FDF_HEADER.length() + 3)
+ {
+ // No version number at all, set to 1.0 as default
+ header = FDF_HEADER + FDF_DEFAULT_VERSION;
+ LOG.debug("No fdf version found, set to " + FDF_DEFAULT_VERSION + " as default.");
+ }
+ else
+ {
+ String headerGarbage = header.substring(FDF_HEADER.length() + 3, header.length()) + "\n";
+ header = header.substring(0, FDF_HEADER.length() + 3);
+ pdfSource.unread(headerGarbage.getBytes("ISO-8859-1"));
+ }
+ }
+ }
+ document.setHeaderString(header);
+
+ try
+ {
+ if (header.startsWith(PDF_HEADER))
+ {
+ float pdfVersion = Float.parseFloat(
+ header.substring(PDF_HEADER.length(), Math.min(header.length(), PDF_HEADER.length() + 3)));
+ document.setVersion(pdfVersion);
+ }
+ else
+ {
+ float pdfVersion = Float.parseFloat(
+ header.substring(FDF_HEADER.length(), Math.min(header.length(), FDF_HEADER.length() + 3)));
+ document.setVersion(pdfVersion);
+ }
+ }
+ catch (NumberFormatException e)
+ {
+ throw new IOException("Error getting pdf version:" + e);
+ }
+ }
}