You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2014/05/17 19:36:24 UTC
svn commit: r1595520 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Author: tilman
Date: Sat May 17 17:36:24 2014
New Revision: 1595520

URL: http://svn.apache.org/r1595520
Log:
PDFBOX-1922: read version header in non sequential parser

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1595520&r1=1595519&r2=1595520&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sat May 17 17:36:24 2014
@@ -81,6 +81,10 @@ import org.apache.pdfbox.persistence.uti
  */
 public class NonSequentialPDFParser extends PDFParser
 {
+    private static final String PDF_HEADER = "%PDF-"; // marker parseHeader() searches for in a PDF header line
+    private static final String FDF_HEADER = "%FDF-"; // marker parseHeader() searches for in an FDF header line
+    private static final String PDF_DEFAULT_VERSION = "1.4"; // used when no version follows PDF_HEADER
+    private static final String FDF_DEFAULT_VERSION = "1.0"; // used when no version follows FDF_HEADER
 
 	private static final byte[] XREF = new byte[] { 'x', 'r', 'e', 'f' };
 
@@ -733,6 +737,11 @@ public class NonSequentialPDFParser exte
 
         try
         {
+            // PDFBOX-1922 read the version header and rewind
+            // this part copied from the sequential parser
+            parseHeader();
+            pdfSource.seek(0);
+            
             if (!initialParseDone)
             {
                 initialParse();
@@ -1866,5 +1875,109 @@ public class NonSequentialPDFParser exte
     		}
     	}
     }
+    
+    /**
+     * Reads the version header from {@code pdfSource} and stores it in the document.
+     * <p>
+     * The first line is read via {@code readLine()}; if it holds neither the
+     * "%PDF-" nor the "%FDF-" marker, further lines are read until one does or
+     * until a line starting with a digit is reached. Anything in front of the
+     * marker is dropped and the header is reduced to the marker plus three
+     * version characters; text following those characters is pushed back to
+     * {@code pdfSource}. If fewer than three characters follow the marker, the
+     * default version (1.4 for PDF, 1.0 for FDF) is used instead.
+     * <p>
+     * The resulting header string and the parsed version number are passed to
+     * {@code document.setHeaderString} and {@code document.setVersion}.
+     * <p>
+     * This method does not rewind the source; a caller that wants to parse from
+     * the start again has to reposition {@code pdfSource} itself.
+     *
+     * @throws IOException if no header marker is found, if the version is not a
+     *             valid number, or if reading from the source fails.
+     */
+    private void parseHeader() throws IOException
+    {
+        // read first line
+        String header = readLine();
+
+        // some pdf-documents are broken and the pdf-version is in one of the following lines
+        if (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
+        {
+            header = readLine();
+            while (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
+            {
+                // if a line starts with a digit, it has to be the first one with data in it
+                if ((header.length() > 0) && Character.isDigit(header.charAt(0)))
+                {
+                    break;
+                }
+                header = readLine();
+            }
+        }
+
+        // sometimes there are some garbage bytes in the line before the header
+        // actually starts, so locate the marker first (PDF is tried before FDF)
+        int headerStart = header.indexOf(PDF_HEADER);
+        if (headerStart == -1)
+        {
+            headerStart = header.indexOf(FDF_HEADER);
+        }
+
+        // nothing found
+        if (headerStart == -1)
+        {
+            throw new IOException("Error: Header doesn't contain versioninfo");
+        }
+
+        // trim off any leading characters; pointless if the marker is at index zero
+        if (headerStart > 0)
+        {
+            header = header.substring(headerStart);
+        }
+
+        // the PDF and FDF headers only differ in marker and default version
+        boolean isPdf = header.startsWith(PDF_HEADER);
+        String marker = isPdf ? PDF_HEADER : FDF_HEADER;
+        String defaultVersion = isPdf ? PDF_DEFAULT_VERSION : FDF_DEFAULT_VERSION;
+
+        // a version is written as digit, dot, digit, i.e. three characters
+        int versionEnd = marker.length() + 3;
+
+        // the dot is escaped so that only a literal '.' separates the two digits
+        if (!header.matches(marker + "\\d\\.\\d"))
+        {
+            if (header.length() < versionEnd)
+            {
+                // No version number at all, fall back to the default
+                header = marker + defaultVersion;
+                LOG.debug("No " + (isPdf ? "pdf" : "fdf") + " version found, set to "
+                        + defaultVersion + " as default.");
+            }
+            else
+            {
+                // there is garbage after the header on the same line: keep the
+                // header and push the remainder back as a line of its own
+                String headerGarbage = header.substring(versionEnd) + "\n";
+                header = header.substring(0, versionEnd);
+                pdfSource.unread(headerGarbage.getBytes("ISO-8859-1"));
+            }
+        }
+
+        document.setHeaderString(header);
+
+        try
+        {
+            // the version is whatever follows the marker, at most three characters
+            String version = header.substring(marker.length(),
+                    Math.min(header.length(), versionEnd));
+            document.setVersion(Float.parseFloat(version));
+        }
+        catch (NumberFormatException e)
+        {
+            // keep the original exception as cause so the stack trace is not lost
+            throw new IOException("Error getting pdf version:" + e, e);
+        }
+    }
 
 }