You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2016/05/10 18:59:44 UTC

svn commit: r1743248 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Author: jahewson
Date: Tue May 10 18:59:43 2016
New Revision: 1743248

URL: http://svn.apache.org/viewvc?rev=1743248&view=rev
Log:
PDFBOX-3347: fallback to ISO-8859-1 for names with invalid UTF-8

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1743248&r1=1743247&r2=1743248&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Tue May 10 18:59:43 2016
@@ -18,6 +18,9 @@ package org.apache.pdfbox.pdfparser;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CharsetDecoder;
 import java.util.Arrays;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -776,11 +779,39 @@ public abstract class BaseParser
         {
             seqSource.unread(c);
         }
-        String string = new String(buffer.toByteArray(), Charsets.UTF_8);
+        
+        byte[] bytes = buffer.toByteArray();
+        String string;
+        if (isValidUTF8(bytes))
+        {
+            string = new String(buffer.toByteArray(), Charsets.UTF_8);
+        }
+        else
+        {
+            // some malformed PDFs don't use UTF-8 see PDFBOX-3347
+            string = new String(buffer.toByteArray(), Charsets.ISO_8859_1);
+        }
         return COSName.getPDFName(string);
     }
 
     /**
+     * Returns true if a byte sequence is valid UTF-8, i.e. decoding it with a fresh UTF-8 decoder succeeds.
+     */
+    private boolean isValidUTF8(byte[] input)
+    {
+        CharsetDecoder cs = Charsets.UTF_8.newDecoder(); // new decoders report malformed input instead of replacing it
+        try
+        {
+            cs.decode(ByteBuffer.wrap(input)); // decoded text is discarded; only success or failure matters
+            return true;
+        }
+        catch (CharacterCodingException e)
+        {
+            return false; // input could not be decoded as UTF-8
+        }
+    }
+    
+    /**
      * This will parse a boolean object from the stream.
      *
      * @return The parsed boolean object.