You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2016/05/10 18:59:44 UTC
svn commit: r1743248 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Author: jahewson
Date: Tue May 10 18:59:43 2016
New Revision: 1743248
URL: http://svn.apache.org/viewvc?rev=1743248&view=rev
Log:
PDFBOX-3347: fallback to ISO-8859-1 for names with invalid UTF-8
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1743248&r1=1743247&r2=1743248&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Tue May 10 18:59:43 2016
@@ -18,6 +18,9 @@ package org.apache.pdfbox.pdfparser;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CharsetDecoder;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -776,11 +779,39 @@ public abstract class BaseParser
{
seqSource.unread(c);
}
- String string = new String(buffer.toByteArray(), Charsets.UTF_8);
+
+ byte[] bytes = buffer.toByteArray();
+ String string;
+ if (isValidUTF8(bytes))
+ {
+ string = new String(buffer.toByteArray(), Charsets.UTF_8);
+ }
+ else
+ {
+ // some malformed PDFs don't use UTF-8 see PDFBOX-3347
+ string = new String(buffer.toByteArray(), Charsets.ISO_8859_1);
+ }
return COSName.getPDFName(string);
}
/**
+ * Returns true if a byte sequence is valid UTF-8, i.e. a newly created UTF-8 decoder decodes all of it without a CharacterCodingException.
+ */
+ private boolean isValidUTF8(byte[] input)
+ {
+ CharsetDecoder cs = Charsets.UTF_8.newDecoder(); // a fresh CharsetDecoder reports malformed input (JDK default) instead of replacing it
+ try
+ {
+ cs.decode(ByteBuffer.wrap(input)); // decoded text is discarded; only success or failure matters
+ return true;
+ }
+ catch (CharacterCodingException e) // decoding failed, so the bytes are not valid UTF-8
+ {
+ return false;
+ }
+ }
+
+ /**
* This will parse a boolean object from the stream.
*
* @return The parsed boolean object.