You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by je...@apache.org on 2014/05/29 12:35:53 UTC
svn commit: r1598250 - in /pdfbox/branches/1.8: pdfbox/src/main/java/org/apache/pdfbox/cos/ pdfbox/src/main/java/org/apache/pdfbox/encoding/ pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/ pdfbox/src/...

Author: jeremias
Date: Thu May 29 10:35:52 2014
New Revision: 1598250

URL: http://svn.apache.org/r1598250
Log:
PDFBOX-2102: Fix for swallowed character in COSString.getString().
Introduced a java.nio.charset.Charset subclass that implements "PDFDocEncoding" fully, not just partially as PdfDocEncoding does. It should also be faster than PdfDocEncoding. It also made it possible to remove the need for the "isDictionaryValue" parameter introduced with PDFBOX-1437. Methods and constructors with that parameter are now marked as deprecated and are no longer referenced internally. Compatibility of the change has been checked against the test file in PDFBOX-1437, and a unit test covers it, too.
There is a META-INF/services registration for the CharsetProvider, but PDFBox's own code does not use it right now because it might be unreliable when PDFBox is used in environments like a WAR, due to class loading issues. Still, it does no harm, and someone may find it useful.

Added:
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java   (with props)
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java   (with props)
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java   (with props)
    pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/
    pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/services/
    pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/services/java.nio.charset.spi.CharsetProvider
    pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/
    pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java   (with props)
    pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/package.html   (with props)
Modified:
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/COSWriter.java
    pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSString.java
    pdfbox/branches/1.8/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/cos/COSString.java Thu May 29 10:35:52 2014
@@ -19,16 +19,19 @@ package org.apache.pdfbox.cos;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-import org.apache.pdfbox.encoding.PdfDocEncoding;
+import org.apache.pdfbox.encoding.PDFDocEncodingCharset;
 import org.apache.pdfbox.exceptions.COSVisitorException;
 import org.apache.pdfbox.persistence.util.COSHEXTable;
 
 /**
  * This represents a string object in a PDF document.
- * 
+ *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  * @version $Revision: 1.30 $
  */
@@ -89,8 +92,6 @@ public class COSString extends COSBase
      */
     private boolean forceHexForm = false;
 
-    private boolean isDictionary = false;
-
     /**
      * Constructor.
      */
@@ -99,20 +100,21 @@ public class COSString extends COSBase
         out = new ByteArrayOutputStream();
     }
 
-    /** 
+    /**
      * Constructor.
-     * 
+     *
      * @param isDictionaryValue determines if this string represents a dictionary
+     * @deprecated Not needed anymore. Use {@link #COSString()} instead. PDFBOX-1437
      */
+    @Deprecated
     public COSString(boolean isDictionaryValue)
     {
         this();
-        isDictionary = isDictionaryValue;
     }
 
     /**
      * Explicit constructor for ease of manual PDF construction.
-     * 
+     *
      * @param value
      *            The string value of the object.
      */
@@ -155,7 +157,7 @@ public class COSString extends COSBase
 
     /**
      * Explicit constructor for ease of manual PDF construction.
-     * 
+     *
      * @param value
      *            The string value of the object.
      */
@@ -175,7 +177,7 @@ public class COSString extends COSBase
 
     /**
      * Forces the string to be written in literal form instead of hexadecimal form.
-     * 
+     *
      * @param v
      *            if v is true the string will be written in literal form, otherwise it will be written in hexa if
      *            necessary.
@@ -188,7 +190,7 @@ public class COSString extends COSBase
 
     /**
      * Forces the string to be written in hexadecimal form instead of literal form.
-     * 
+     *
      * @param v
      *            if v is true the string will be written in hexadecimal form otherwise it will be written in literal if
      *            necessary.
@@ -201,7 +203,7 @@ public class COSString extends COSBase
 
     /**
      * This will create a COS string from a string of hex characters.
-     * 
+     *
      * @param hex
      *            A hex string.
      * @return A cos string with the hex characters converted to their actual bytes.
@@ -215,7 +217,7 @@ public class COSString extends COSBase
 
     /**
      * Creates a COS string from a string of hex characters, optionally ignoring malformed input.
-     * 
+     *
      * @param hex
      *            A hex string.
      * @param force
@@ -259,7 +261,7 @@ public class COSString extends COSBase
 
     /**
      * This will take this string and create a hex representation of the bytes that make the string.
-     * 
+     *
      * @return A hex string representing the bytes in this string.
      */
     public String getHexString()
@@ -277,92 +279,74 @@ public class COSString extends COSBase
 
     /**
      * This will get the string that this object wraps.
-     * 
+     *
      * @return The wrapped string.
      */
     public String getString()
     {
-        if (this.str != null)
+        if (str != null)
         {
-            return this.str;
+            return str;
         }
         String retval;
-        String encoding = "ISO-8859-1";
+        Charset charset = PDFDocEncodingCharset.INSTANCE;
         byte[] data = getBytes();
         int start = 0;
         if (data.length > 2)
         {
             if (data[0] == (byte) 0xFF && data[1] == (byte) 0xFE)
             {
-                encoding = "UTF-16LE";
+                charset = Charset.forName("UTF-16LE");
                 start = 2;
             }
             else if (data[0] == (byte) 0xFE && data[1] == (byte) 0xFF)
             {
-                encoding = "UTF-16BE";
+                charset = Charset.forName("UTF-16BE");
                 start = 2;
             }
         }
-        try
-        {
-            if (isDictionary && encoding.equals("ISO-8859-1"))
-            {
-                byte[] tmp = getBytes();
-                PdfDocEncoding pde = new PdfDocEncoding();
-                StringBuilder sb = new StringBuilder(tmp.length);
-                for (byte b : tmp)
-                {
-                    final String character = pde.getCharacter((b + 256) % 256);
-                    if (character != null)
-                    {
-                        sb.append(character);
-                    }
-                }
-                retval = sb.toString();
-            }
-            else
-            {
-                retval = new String(getBytes(), start, data.length - start, encoding);
-            }
-        }
-        catch (IOException e)
-        {
-            // should never happen
-            LOG.error (e,e);
-            retval = new String(getBytes());
-        }
-        this.str = retval;
+
+        retval = toString(data, start, data.length - start, charset);
+        str = retval;
         return retval;
     }
 
+    private static String toString(byte[] data, int offset, int length, Charset charset)
+    {
+        //This is only needed until PDFBox switches to JavaSE-1.6
+        //This could be just: return new String(data, offset, length, charset);
+        CharBuffer charBuffer = charset.decode(ByteBuffer.wrap(data, offset, length));
+        return charBuffer.toString();
+    }
+
     /**
      * This will append a byte[] to the string.
-     * 
+     *
      * @param data
      *            The byte[] to add to this string.
-     * 
+     *
      * @throws IOException
      *             If an IO error occurs while writing the byte.
      */
     public void append(byte[] data) throws IOException
     {
         out.write(data);
-        this.str = null;
+        str = null;
     }
 
     /**
      * This will append a byte to the string.
-     * 
+     *
      * @param in
      *            The byte to add to this string.
-     * 
+     *
      * @throws IOException
      *             If an IO error occurs while writing the byte.
      */
     public void append(int in) throws IOException
     {
         out.write(in);
-        this.str = null;
+        str = null;
     }
 
     /**
@@ -371,12 +355,12 @@ public class COSString extends COSBase
     public void reset()
     {
         out.reset();
-        this.str = null;
+        str = null;
     }
 
     /**
      * This will get the bytes of the string.
-     * 
+     *
      * @return A byte array that represents the string.
      */
     public byte[] getBytes()
@@ -395,7 +379,7 @@ public class COSString extends COSBase
 
     /**
      * This will output this string as a PDF object.
-     * 
+     *
      * @param output
      *            The stream to write to.
      * @throws IOException
@@ -475,7 +459,7 @@ public class COSString extends COSBase
 
     /**
      * visitor pattern double dispatch method.
-     * 
+     *
      * @param visitor
      *            The object to notify when visiting this object.
      * @return any object, depending on the visitor implementation, or null
@@ -497,7 +481,7 @@ public class COSString extends COSBase
         if (obj instanceof COSString)
         {
             COSString strObj = (COSString) obj;
-            return this.getString().equals(strObj.getString()) && this.forceHexForm == strObj.forceHexForm;
+            return this.getString().equals(strObj.getString()) && forceHexForm == strObj.forceHexForm;
         }
         return false;
     }

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java Thu May 29 10:35:52 2014
@@ -296,7 +296,7 @@ public abstract class Encoding implement
         String name = getName( code );
         if (name != null)
         {
-            return getCharacter( getName( code ) );
+            return getCharacter( name );
         }
         return null;
     }
@@ -349,7 +349,7 @@ public abstract class Encoding implement
                     character = name;
                 }
             }
-            // test for an alternate Unicode name representation 
+            // test for an alternate Unicode name representation
             else if ( name.startsWith( "u" ) )
             {
                 try

Added: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java?rev=1598250&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java (added)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java Thu May 29 10:35:52 2014
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding;
+
+import java.nio.charset.Charset;
+import java.nio.charset.spi.CharsetProvider;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * {@link CharsetProvider} implementation for publishing PDFBox's encodings.
+ * @version $Revision$
+ */
+public class PDFBoxCharsetProvider extends CharsetProvider
+{
+
+    private final Set<Charset> available = new java.util.HashSet<Charset>();
+    private final Map<String, Charset> map = new java.util.HashMap<String, Charset>();
+
+    /**
+     * Constructor.
+     */
+    public PDFBoxCharsetProvider()
+    {
+        available.add(PDFDocEncodingCharset.INSTANCE);
+        for (Charset cs : available)
+        {
+            map.put(cs.name(), cs);
+            for (String alias : cs.aliases())
+            {
+                map.put(alias, cs);
+            }
+        }
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Iterator<Charset> charsets()
+    {
+        return Collections.unmodifiableSet(available).iterator();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Charset charsetForName(String charsetName)
+    {
+        return map.get(charsetName);
+    }
+
+}

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFBoxCharsetProvider.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java?rev=1598250&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java (added)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java Thu May 29 10:35:52 2014
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding;
+
+import java.nio.charset.Charset;
+
+/**
+ * {@link Charset} implementation for the "PDFDocEncoding" from the PDF specification.
+ * @version $Revision$
+ */
+public class PDFDocEncodingCharset extends SingleByteCharset
+{
+
+    /** Canonical name for the PDFDocEncoding. */
+    public static final String NAME = "PDFDocEncoding";
+
+    /** Singleton instance. */
+    public static final PDFDocEncodingCharset INSTANCE = new PDFDocEncodingCharset();
+
+    /**
+     * Creates a new "PDFDocEncoding" charset.
+     */
+    public PDFDocEncodingCharset()
+    {
+        super(NAME, null, createEncoding());
+    }
+
+    private static char[] createEncoding()
+    {
+        char[] encoding = new char[256];
+
+        //Initialize with basically ISO-8859-1
+        for (int i = 0; i < 255; i++)
+        {
+            encoding[i] = (char)i;
+        }
+        //...then do all deviations (based on the table in ISO 32000-1:2008)
+        //block 1
+        encoding[0x18] = '\u02D8'; //BREVE
+        encoding[0x19] = '\u02C7'; //CARON
+        encoding[0x1A] = '\u02C6'; //MODIFIER LETTER CIRCUMFLEX ACCENT
+        encoding[0x1B] = '\u02D9'; //DOT ABOVE
+        encoding[0x1C] = '\u02DD'; //DOUBLE ACUTE ACCENT
+        encoding[0x1D] = '\u02DB'; //OGONEK
+        encoding[0x1E] = '\u02DA'; //RING ABOVE
+        encoding[0x1F] = '\u02DC'; //SMALL TILDE
+        //block 2
+        encoding[0x7F] = REPLACEMENT_CHARACTER; //undefined
+        encoding[0x80] = '\u2022'; //BULLET
+        encoding[0x81] = '\u2020'; //DAGGER
+        encoding[0x82] = '\u2021'; //DOUBLE DAGGER
+        encoding[0x83] = '\u2026'; //HORIZONTAL ELLIPSIS
+        encoding[0x84] = '\u2014'; //EM DASH
+        encoding[0x85] = '\u2013'; //EN DASH
+        encoding[0x86] = '\u0192'; //LATIN SMALL LETTER SCRIPT F
+        encoding[0x87] = '\u2044'; //FRACTION SLASH (solidus)
+        encoding[0x88] = '\u2039'; //SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+        encoding[0x89] = '\u203A'; //SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+        encoding[0x8A] = '\u2212'; //MINUS SIGN
+        encoding[0x8B] = '\u2030'; //PER MILLE SIGN
+        encoding[0x8C] = '\u201E'; //DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
+        encoding[0x8D] = '\u201C'; //LEFT DOUBLE QUOTATION MARK (double quote left)
+        encoding[0x8E] = '\u201D'; //RIGHT DOUBLE QUOTATION MARK (quotedblright)
+        encoding[0x8F] = '\u2018'; //LEFT SINGLE QUOTATION MARK (quoteleft)
+        encoding[0x90] = '\u2019'; //RIGHT SINGLE QUOTATION MARK (quoteright)
+        encoding[0x91] = '\u201A'; //SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
+        encoding[0x92] = '\u2122'; //TRADE MARK SIGN
+        encoding[0x93] = '\uFB01'; //LATIN SMALL LIGATURE FI
+        encoding[0x94] = '\uFB02'; //LATIN SMALL LIGATURE FL
+        encoding[0x95] = '\u0141'; //LATIN CAPITAL LETTER L WITH STROKE
+        encoding[0x96] = '\u0152'; //LATIN CAPITAL LIGATURE OE
+        encoding[0x97] = '\u0160'; //LATIN CAPITAL LETTER S WITH CARON
+        encoding[0x98] = '\u0178'; //LATIN CAPITAL LETTER Y WITH DIAERESIS
+        encoding[0x99] = '\u017D'; //LATIN CAPITAL LETTER Z WITH CARON
+        encoding[0x9A] = '\u0131'; //LATIN SMALL LETTER DOTLESS I
+        encoding[0x9B] = '\u0142'; //LATIN SMALL LETTER L WITH STROKE
+        encoding[0x9C] = '\u0153'; //LATIN SMALL LIGATURE OE
+        encoding[0x9D] = '\u0161'; //LATIN SMALL LETTER S WITH CARON
+        encoding[0x9E] = '\u017E'; //LATIN SMALL LETTER Z WITH CARON
+        encoding[0x9F] = REPLACEMENT_CHARACTER; //undefined
+        encoding[0xA0] = '\u20AC'; //EURO SIGN
+        //end of deviations
+        return encoding;
+    }
+
+}

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/PDFDocEncodingCharset.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java?rev=1598250&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java (added)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java Thu May 29 10:35:52 2014
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+/**
+ * {@link Charset} implementation for the single-byte encodings.
+ * @version $Revision$
+ */
+public class SingleByteCharset extends Charset
+{
+
+    /** Unicode replacement character 0xFFFD. */
+    protected static final char REPLACEMENT_CHARACTER = '\uFFFD';
+
+    private final char[] toUnicodeMap;
+    private byte[][] toByteMap;
+
+    /**
+     * Creates a new single-byte charset using an array of unicode characters.
+     * @param canonicalName the canonical name
+     * @param aliases An array of this charset's aliases, or null if it has no aliases
+     * @param toUnicodeMap the array of unicode characters (may have a maximum of 256 characters,
+     *          first character must be 0x0000)
+     */
+    protected SingleByteCharset(String canonicalName, String[] aliases, char[] toUnicodeMap)
+    {
+        super(canonicalName, aliases);
+        if (toUnicodeMap.length > 256)
+        {
+            throw new IllegalArgumentException("Single-byte encodings may have at most 256 characters.");
+        }
+        //Copy array so it cannot be changed accidentally from the outside
+        this.toUnicodeMap = new char[256];
+        System.arraycopy(toUnicodeMap, 0, this.toUnicodeMap, 0, toUnicodeMap.length);
+        //build the inverse lookup table
+        initInverseMap();
+    }
+
+    private void initInverseMap()
+    {
+        toByteMap = new byte[256][];
+        if (toUnicodeMap[0] != '\u0000')
+        {
+            throw new IllegalArgumentException("First character in map must be a NUL (0x0000) character.");
+            //because we're using 0x00 for encoding otherwise unmapped characters
+        }
+
+        //we're building a kind of sparse lookup table in which not all subranges are covered.
+        for (int i = 1, len = toUnicodeMap.length; i < len; i++)
+        {
+            char ch = toUnicodeMap[i];
+            if (ch == REPLACEMENT_CHARACTER)
+            {
+                continue; //skip
+            }
+            int upper = ch >> 8;
+            int lower = ch & 0xFF;
+            if (upper > 0xFF)
+            {
+                throw new IllegalArgumentException("Not a compatible character: "
+                        + ch + " (" + Integer.toHexString(ch) + ")");
+            }
+            byte[] map = toByteMap[upper];
+            if (map == null)
+            {
+                map = new byte[256];
+                toByteMap[upper] = map;
+            }
+            map[lower] = (byte)(i & 0xFF);
+        }
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean contains(Charset cs)
+    {
+        return (cs.getClass() == getClass());
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public CharsetDecoder newDecoder()
+    {
+        return new Decoder();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public CharsetEncoder newEncoder()
+    {
+        return new Encoder();
+    }
+
+    /** The decoder. */
+    private class Decoder extends CharsetDecoder
+    {
+
+        protected Decoder()
+        {
+            super(SingleByteCharset.this, 1, 1);
+        }
+
+        @Override
+        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out)
+        {
+            while (in.hasRemaining())
+            {
+                byte b = in.get();
+                char ch;
+
+                if (!out.hasRemaining())
+                {
+                    in.position(in.position() - 1);
+                    return CoderResult.OVERFLOW;
+                }
+                ch = toUnicodeMap[b & 0xFF];
+                if (ch == REPLACEMENT_CHARACTER)
+                {
+                    return CoderResult.unmappableForLength(1);
+                }
+                out.put(ch);
+            }
+            return CoderResult.UNDERFLOW;
+        }
+
+    }
+
+    /** The encoder. */
+    private class Encoder extends CharsetEncoder
+    {
+
+        protected Encoder()
+        {
+            super(SingleByteCharset.this, 1, 1);
+        }
+
+        @Override
+        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out)
+        {
+            while (in.hasRemaining())
+            {
+                int ch = in.get();
+
+                if (!out.hasRemaining())
+                {
+                    in.position(in.position() - 1);
+                    return CoderResult.OVERFLOW;
+                }
+
+                int upper = ch >> 8;
+                int lower = ch & 0xFF;
+                if (upper > 0xFF)
+                {
+                    in.position(in.position() - 1);
+                    return CoderResult.unmappableForLength(1);
+                }
+                byte[] map = toByteMap[upper];
+                if (map == null)
+                {
+                    in.position(in.position() - 1);
+                    return CoderResult.unmappableForLength(1);
+                }
+                byte b = map[lower];
+                if (b == 0x00)
+                {
+                    in.position(in.position() - 1);
+                    return CoderResult.unmappableForLength(1);
+                }
+
+                out.put(b);
+            }
+            return CoderResult.UNDERFLOW;
+        }
+
+    }
+
+}

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/encoding/SingleByteCharset.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu May 29 10:35:52 2014
@@ -48,7 +48,7 @@ import org.apache.pdfbox.persistence.uti
  * PDFParser and the COSStreamParser.
  *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * 
+ * @version $Revision$
  */
 public abstract class BaseParser
 {
@@ -56,7 +56,7 @@ public abstract class BaseParser
     private static final long OBJECT_NUMBER_THRESHOLD = 10000000000L;
 
     private static final long GENERATION_NUMBER_THRESHOLD = 65535;
-    
+
     /**
      * system property allowing to define size of push back buffer.
      */
@@ -137,13 +137,13 @@ public abstract class BaseParser
         {
             FORCE_PARSING = Boolean.getBoolean("org.apache.pdfbox.forceParsing");
         }
-        catch (SecurityException e)  
+        catch (SecurityException e)
         {
             // PDFBOX-1946 since Boolean.getBoolean calls System.getProperty, this can occur
             /* ignore and use default */
         }
     }
-    
+
     /**
      * This is the stream that will be read from.
      */
@@ -164,7 +164,7 @@ public abstract class BaseParser
      */
     public BaseParser()
     {
-        this.forceParsing = FORCE_PARSING;
+        forceParsing = FORCE_PARSING;
     }
 
     /**
@@ -179,18 +179,18 @@ public abstract class BaseParser
     public BaseParser(InputStream input, boolean forceParsingValue)
             throws IOException
     {
-    	int pushbacksize = 65536;
-    	try
-		{
-			pushbacksize = Integer.getInteger( PROP_PUSHBACK_SIZE, 65536 );
-		}
-		catch (SecurityException e)  // getInteger calls System.getProperties, which can get exception
-		{
-			// ignore and use default
-		}
-        this.pdfSource = new PushBackInputStream(
-        		new BufferedInputStream(input, 16384), pushbacksize );
-        this.forceParsing = forceParsingValue;
+        int pushbacksize = 65536;
+        try
+        {
+            pushbacksize = Integer.getInteger( PROP_PUSHBACK_SIZE, 65536 );
+        }
+        catch (SecurityException e)  // getInteger calls System.getProperties, which can get exception
+        {
+            // ignore and use default
+        }
+        pdfSource = new PushBackInputStream(
+                new BufferedInputStream(input, 16384), pushbacksize );
+        forceParsing = forceParsingValue;
     }
 
     /**
@@ -199,7 +199,7 @@ public abstract class BaseParser
      * @param input The input stream to read the data from.
      * @throws IOException If there is an error reading the input stream.
      */
-    public BaseParser(InputStream input) throws IOException 
+    public BaseParser(InputStream input) throws IOException
     {
         this(input, FORCE_PARSING);
     }
@@ -210,7 +210,7 @@ public abstract class BaseParser
      * @param input The array to read the data from.
      * @throws IOException If there is an error reading the byte data.
      */
-    protected BaseParser(byte[] input) throws IOException 
+    protected BaseParser(byte[] input) throws IOException
     {
         this(new ByteArrayInputStream(input));
     }
@@ -309,31 +309,31 @@ public abstract class BaseParser
                     {
                         // in addition to stopping when we find / or >, we also want
                         // to stop when we find endstream or endobj.
-                        if(read==E) 
+                        if(read==E)
                         {
                             read = pdfSource.read();
-                            if(read==N) 
+                            if(read==N)
                             {
                                 read = pdfSource.read();
                                 if(read==D)
                                 {
                                     read = pdfSource.read();
-                                    if(read==S) 
+                                    if(read==S)
                                     {
                                         read = pdfSource.read();
-                                        if(read==T) 
+                                        if(read==T)
                                         {
                                             read = pdfSource.read();
-                                            if(read==R) 
+                                            if(read==R)
                                             {
                                                 read = pdfSource.read();
-                                                if(read==E) 
+                                                if(read==E)
                                                 {
                                                     read = pdfSource.read();
-                                                    if(read==A) 
+                                                    if(read==A)
                                                     {
                                                         read = pdfSource.read();
-                                                        if(read==M) 
+                                                        if(read==M)
                                                         {
                                                             return obj; // we're done reading this object!
                                                         }
@@ -341,14 +341,14 @@ public abstract class BaseParser
                                                 }
                                             }
                                         }
-                                    } 
-                                    else if(read==O) 
+                                    }
+                                    else if(read==O)
                                     {
                                         read = pdfSource.read();
-                                        if(read==B) 
+                                        if(read==B)
                                         {
                                             read = pdfSource.read();
-                                            if(read==J) 
+                                            if(read==J)
                                             {
                                                 return obj; // we're done reading this object!
                                             }
@@ -489,7 +489,7 @@ public abstract class BaseParser
 // we do not know if length object is redefined later on and the currently
 // read indirect object might be obsolete (e.g. not referenced in xref table);
 // this would result in reading wrong number of bytes;
-// Thus the only reliable information is a direct length. 
+// Thus the only reliable information is a direct length.
 // This exclusion shouldn't harm much since in case of indirect objects they will
 // typically be defined after the stream object, thus keeping the directly
 // provided length will fix most cases
@@ -497,8 +497,8 @@ public abstract class BaseParser
 //                      ( ( (COSObject) streamLength ).getObject() instanceof COSNumber ) )
 //            {
 //                length = ( (COSNumber) ( (COSObject) streamLength ).getObject() ).intValue();
-//            } 
-            
+//            }
+
             if ( length == -1 )
             {
                 // Couldn't determine length from dict: just
@@ -520,10 +520,10 @@ public abstract class BaseParser
                     out.write( strmBuf, 0, readCount );
                     left -= readCount;
                 }
-                
+
                 // in order to handle broken documents we test if 'endstream' is reached
                 // if not, length value possibly was wrong, fall back to scanning for endstream
-                
+
                 // fill buffer with next bytes and test for 'endstream' (with leading whitespaces)
                 int readCount = pdfSource.read( strmBuf, 0, 20 );
                 if ( readCount > 0 )
@@ -532,7 +532,7 @@ public abstract class BaseParser
                     int     nextEndstreamCIdx = 0;
                     for ( int cIdx = 0; cIdx < readCount; cIdx++ )
                     {
-                        final int ch = strmBuf[ cIdx ] & 0xff; 
+                        final int ch = strmBuf[ cIdx ] & 0xff;
                         if ( ch == ENDSTREAM[ nextEndstreamCIdx ] )
                         {
                             if ( ++nextEndstreamCIdx >= ENDSTREAM.length )
@@ -547,26 +547,23 @@ public abstract class BaseParser
                             break;
                         }
                     }
-                    
+
                     // push back test bytes
                     pdfSource.unread( strmBuf, 0, readCount );
-                    
+
                     // if 'endstream' was not found fall back to scanning
                     if ( ! foundEndstream )
                     {
-                        LOG.warn("Specified stream length " + length 
+                        LOG.warn("Specified stream length " + length
                                 + " is wrong. Fall back to reading stream until 'endstream'.");
-                        
+
                         // push back all read stream bytes
                         // we got a buffered stream wrapper around filteredStream thus first flush to underlying stream
                         out.flush();
                         InputStream writtenStreamBytes = stream.getFilteredStream();
                         ByteArrayOutputStream bout = new ByteArrayOutputStream( length );
-                        
-                        while ( ( readCount = writtenStreamBytes.read( strmBuf ) ) >= 0 )
-                        {
-                            bout.write( strmBuf, 0, readCount );
-                        }
+
+                        IOUtils.copy(writtenStreamBytes, bout);
                         IOUtils.closeQuietly(writtenStreamBytes);
                         try
                         {
@@ -574,20 +571,20 @@ public abstract class BaseParser
                         }
                         catch ( IOException ioe )
                         {
-                            throw new WrappedIOException( "Could not push back " + bout.size() + 
+                            throw new WrappedIOException( "Could not push back " + bout.size() +
                                                           " bytes in order to reparse stream. " +
                                                           "Try increasing push back buffer using system property " +
                                                           PROP_PUSHBACK_SIZE, ioe );
                         }
                         // close and create new filtered stream
-                      	IOUtils.closeQuietly(out);
+                        IOUtils.closeQuietly(out);
                         out = stream.createFilteredStream( streamLength );
                         // scan until we find endstream:
                         readUntilEndStream(new EndstreamOutputStream(out));
                     }
                 }
             }
-            
+
             skipSpaces();
             String endStream = readString();
 
@@ -646,12 +643,12 @@ public abstract class BaseParser
      * object. Some pdf files, however, forget to write some endstream tags
      * and just close off objects with an "endobj" tag so we have to handle
      * this case as well.
-     * 
+     *
      * This method is optimized using buffered IO and reduced number of
      * byte compare operations.
-     * 
+     *
      * @param out  stream we write out to.
-     * 
+     *
      * @throws IOException
      */
     private void readUntilEndStream( final OutputStream out ) throws IOException
@@ -660,19 +657,19 @@ public abstract class BaseParser
         int bufSize;
         int charMatchCount = 0;
         byte[] keyw = ENDSTREAM;
-        
+
         final int quickTestOffset = 5;  // last character position of shortest keyword ('endobj')
-        
+
         // read next chunk into buffer; already matched chars are added to beginning of buffer
-        while ( ( bufSize = pdfSource.read( strmBuf, charMatchCount, strmBufLen - charMatchCount ) ) > 0 ) 
+        while ( ( bufSize = pdfSource.read( strmBuf, charMatchCount, strmBufLen - charMatchCount ) ) > 0 )
         {
             bufSize += charMatchCount;
-            
+
             int bIdx = charMatchCount;
             int quickTestIdx;
-        
+
             // iterate over buffer, trying to find keyword match
-            for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ ) 
+            for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ )
             {
                 // reduce compare operations by first test last character we would have to
                 // match if current one matches; if it is not a character from keywords
@@ -680,11 +677,11 @@ public abstract class BaseParser
                 // this shortcut is inspired by the Boyer-Moore string search algorithm
                 // and can reduce parsing time by approx. 20%
                 if ( ( charMatchCount == 0 ) &&
-                         ( ( quickTestIdx = bIdx + quickTestOffset ) < maxQuicktestIdx ) ) 
+                         ( ( quickTestIdx = bIdx + quickTestOffset ) < maxQuicktestIdx ) )
                 {
-                    
+
                     final byte ch = strmBuf[quickTestIdx];
-                    if ( ( ch > 't' ) || ( ch < 'a' ) ) 
+                    if ( ( ch > 't' ) || ( ch < 'a' ) )
                     {
                         // last character we would have to match if current character would match
                         // is not a character from keywords -> jump behind and start over
@@ -692,65 +689,65 @@ public abstract class BaseParser
                         continue;
                     }
                 }
-                
+
                 final byte ch = strmBuf[bIdx];  // could be negative - but we only compare to ASCII
-            
-                if ( ch == keyw[ charMatchCount ] ) 
+
+                if ( ch == keyw[ charMatchCount ] )
                 {
-                    if ( ++charMatchCount == keyw.length ) 
+                    if ( ++charMatchCount == keyw.length )
                     {
                         // match found
                         bIdx++;
                         break;
                     }
-                } 
-                else 
+                }
+                else
                 {
-                    if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) ) 
+                    if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) )
                     {
                         // maybe ENDSTREAM is missing but we could have ENDOBJ
                         keyw = ENDOBJ;
                         charMatchCount++;
-                    } 
-                    else 
+                    }
+                    else
                     {
                         // no match; incrementing match start by 1 would be dumb since we already know matched chars
                         // depending on current char read we may already have beginning of a new match:
                         // 'e': first char matched;
                         // 'n': if we are at match position idx 7 we already read 'e' thus 2 chars matched
-                        // for each other char we have to start matching first keyword char beginning with next 
+                        // for each other char we have to start matching first keyword char beginning with next
                         // read position
                         charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
                         // search again for 'endstream'
                         keyw = ENDSTREAM;
                     }
-                } 
+                }
             }  // for
-            
+
             int contentBytes = Math.max( 0, bIdx - charMatchCount );
-            
+
             // write buffer content until first matched char to output stream
             if ( contentBytes > 0 )
             {
                 out.write( strmBuf, 0, contentBytes );
             }
-            if ( charMatchCount == keyw.length ) 
+            if ( charMatchCount == keyw.length )
             {
                 // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
                 pdfSource.unread( strmBuf, contentBytes, bufSize - contentBytes );
                 break;
-            } 
-            else 
+            }
+            else
             {
                 // copy matched chars at start of buffer
                 System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
             }
-            
+
         }  // while
         
         out.flush(); // this writes a lonely CR or drops trailing CR LF and LF
     }
-    
+
     /**
      * This is really a bug in the Document creators code, but it caused a crash
      * in PDFBox, the first bug was in this format:
@@ -811,6 +808,7 @@ public abstract class BaseParser
         }
         return braces;
     }
+
     /**
      * This will parse a PDF string.
      *
@@ -818,11 +816,25 @@ public abstract class BaseParser
      * @return The parsed PDF string.
      *
      * @throws IOException If there is an error reading from the stream.
+     * @deprecated Not needed anymore. Use {@link #parseCOSString()} instead. PDFBOX-1437
      */
+    @Deprecated
     protected COSString parseCOSString(boolean isDictionary) throws IOException
     {
+        return parseCOSString();
+    }
+
+    /**
+     * This will parse a PDF string.
+     *
+     * @return The parsed PDF string.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected COSString parseCOSString() throws IOException
+    {
         char nextChar = (char)pdfSource.read();
-        COSString retval = new COSString(isDictionary);
+        COSString retval = new COSString();
         char openBrace;
         char closeBrace;
         if( nextChar == '(' )
@@ -942,7 +954,7 @@ public abstract class BaseParser
                         {
                             nextc = c;
                         }
-    
+
                         int character = 0;
                         try
                         {
@@ -990,7 +1002,7 @@ public abstract class BaseParser
      * be able to skip to next object start.
      *
      * We assume starting '&lt;' was already read.
-     * 
+     *
      * @return The parsed PDF string.
      *
      * @throws IOException If there is an error reading from the stream.
@@ -1009,7 +1021,7 @@ public abstract class BaseParser
             {
                 break;
             }
-            else if ( c < 0 ) 
+            else if ( c < 0 )
             {
                 throw new IOException( "Missing closing bracket for hex string. Reached EOS." );
             }
@@ -1027,28 +1039,28 @@ public abstract class BaseParser
                 {
                     sBuf.deleteCharAt(sBuf.length()-1);
                 }
-                
+
                 // read till the closing bracket was found
-                do 
+                do
                 {
                     c = pdfSource.read();
                 } while ( c != '>' && c >= 0 );
-                
+
                 // might have reached EOF while looking for the closing bracket
                 // this can happen for malformed PDFs only. Make sure that there is
                 // no endless loop.
-                if ( c < 0 ) 
+                if ( c < 0 )
                 {
                     throw new IOException( "Missing closing bracket for hex string. Reached EOS." );
                 }
-                
+
                 // exit loop
                 break;
             }
         }
         return COSString.createFromHexString( sBuf.toString(), forceParsing );
     }
-    
+
     /**
      * This will parse a PDF array object.
      *
@@ -1272,7 +1284,7 @@ public abstract class BaseParser
             }
             else
             {
-                retval = parseCOSString(true);
+                retval = parseCOSString();
             }
             break;
         }
@@ -1282,7 +1294,7 @@ public abstract class BaseParser
             break;
         }
         case '(':
-            retval = parseCOSString(true);
+            retval = parseCOSString();
             break;
         case '/':   // name
             retval = parseCOSName();
@@ -1366,7 +1378,7 @@ public abstract class BaseParser
                     int peek = pdfSource.peek();
                     // we can end up in an infinite loop otherwise
                     throw new IOException( "Unknown dir object c='" + c +
-                            "' cInt=" + (int)c + " peek='" + (char)peek 
+                            "' cInt=" + (int)c + " peek='" + (char)peek
                             + "' peekInt=" + peek + " " + pdfSource.getOffset() );
                 }
 
@@ -1622,32 +1634,34 @@ public abstract class BaseParser
      * This will read a long from the Stream and throw an {@link IOException} if the long value
      * is negative or has more than 10 digits (i.e. : not smaller than {@link #OBJECT_NUMBER_THRESHOLD})
      * @return the object number being read.
-     * @throws IOException
+     * @throws IOException if an I/O error occurs
      */
     protected long readObjectNumber() throws IOException
     {
         long retval = readLong();
-        if(retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD) {
+        if(retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD)
+        {
             throw new IOException("Object Number '" + retval + "' has more than 10 digits or is negative");
         }
         return retval;
     }
-    
+
     /**
      * This will read an integer from the Stream and throw an {@link IOException} if the integer value
      * is negative or exceeds the maximum object revision (i.e. : bigger than {@link #GENERATION_NUMBER_THRESHOLD})
      * @return the generation number being read.
-     * @throws IOException
+     * @throws IOException if an I/O error occurs
      */
     protected int readGenerationNumber() throws IOException
     {
         int retval = readInt();
-        if(retval < 0 || retval > GENERATION_NUMBER_THRESHOLD) {
+        if(retval < 0 || retval > GENERATION_NUMBER_THRESHOLD)
+        {
             throw new IOException("Generation Number '" + retval + "' has more than 5 digits");
         }
         return retval;
     }
-    
+
     /**
      * This will read an integer from the stream.
      *
@@ -1673,7 +1687,7 @@ public abstract class BaseParser
         }
         return retval;
     }
-    
+
 
     /**
      * This will read a long from the stream.
@@ -1696,14 +1710,16 @@ public abstract class BaseParser
         catch( NumberFormatException e )
         {
             pdfSource.unread(longBuffer.toString().getBytes("ISO-8859-1"));
-            throw new IOException( "Error: Expected a long type at offset "+pdfSource.getOffset() + ", instead got '" + longBuffer + "'");
+            throw new IOException( "Error: Expected a long type at offset "
+                    + pdfSource.getOffset() + ", instead got '" + longBuffer + "'");
         }
         return retval;
     }
 
     /**
-     * This method is used to read a token by the {@linkplain #readInt()} method and the {@linkplain #readLong()} method.
-     *  
+     * This method is used to read a token by the {@linkplain #readInt()} method
+     * and the {@linkplain #readLong()} method.
+     *
      * @return the token to parse as integer or long by the calling method.
      * @throws IOException thrown by the {@link #pdfSource} methods.
      */
@@ -1733,11 +1749,11 @@ public abstract class BaseParser
      */
     public void clearResources()
     {
-    	document = null;
-    	if (pdfSource != null)
-    	{
-    		IOUtils.closeQuietly(pdfSource);
-    		pdfSource = null;
-    	}
+        document = null;
+        if (pdfSource != null)
+        {
+            IOUtils.closeQuietly(pdfSource);
+            pdfSource = null;
+        }
     }
 }

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Thu May 29 10:35:52 2014
@@ -42,12 +42,12 @@ import org.apache.pdfbox.util.PDFOperato
  * This will parse a PDF byte stream and extract operands and such.
  *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * 
+ * @version $Revision$
  */
 public class PDFStreamParser extends BaseParser
 {
     private List<Object> streamObjects = new ArrayList<Object>( 100 );
-    private RandomAccess file;
+    private final RandomAccess file;
     private final int    maxBinCharTestLength = 5;
     private final byte[] binCharTestArr = new byte[maxBinCharTestLength];
 
@@ -261,7 +261,7 @@ public class PDFStreamParser extends Bas
                 }
                 else
                 {
-                    retval = parseCOSString(false);
+                    retval = parseCOSString();
                 }
                 break;
             }
@@ -271,7 +271,7 @@ public class PDFStreamParser extends Bas
                 break;
             }
             case '(': // string
-                retval = parseCOSString(false);
+                retval = parseCOSString();
                 break;
             case '/':   // name
                 retval = parseCOSName();
@@ -361,7 +361,7 @@ public class PDFStreamParser extends Bas
                 retval = PDFOperator.getOperator( next );
                 if( next.equals( "BI" ) )
                 {
-                	PDFOperator beginImageOP = (PDFOperator)retval;
+                    PDFOperator beginImageOP = (PDFOperator)retval;
                     COSDictionary imageParams = new COSDictionary();
                     beginImageOP.setImageParameters( new ImageParameters( imageParams ) );
                     Object nextToken = null;
@@ -530,12 +530,13 @@ public class PDFStreamParser extends Bas
      * {@inheritDoc}
      */
     @Override
-    public void clearResources() {
-    	super.clearResources();
-    	if (streamObjects != null)
-    	{
-    		streamObjects.clear();
-    		streamObjects = null;
-    	}
+    public void clearResources()
+    {
+        super.clearResources();
+        if (streamObjects != null)
+        {
+            streamObjects.clear();
+            streamObjects = null;
+        }
     }
 }

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/COSWriter.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/COSWriter.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/COSWriter.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/COSWriter.java Thu May 29 10:35:52 2014
@@ -923,15 +923,15 @@ public class COSWriter implements ICOSVi
                 COSBase current = i.next();
                 if( current instanceof COSDictionary )
                 {
-                	if (current.isDirect())
-                	{
-                		visitFromDictionary((COSDictionary)current);
-                	}
-                	else
-                	{
-                		addObjectToWrite( current );
-                		writeReference( current );
-                	}
+                    if (current.isDirect())
+                    {
+                        visitFromDictionary((COSDictionary)current);
+                    }
+                    else
+                    {
+                        addObjectToWrite( current );
+                        writeReference( current );
+                    }
                 }
                 else if( current instanceof COSObject )
                 {
@@ -952,7 +952,7 @@ public class COSWriter implements ICOSVi
                 }
                 else if( current instanceof COSString )
                 {
-                    COSString copy = new COSString(true);
+                    COSString copy = new COSString();
                     copy.append(((COSString)current).getBytes());
                     copy.accept(this);
                 }
@@ -1432,7 +1432,7 @@ public class COSWriter implements ICOSVi
      * @throws COSVisitorException If an error occurs while generating the data.
      */
     public void write(PDDocument doc) throws COSVisitorException
-	{
+    {
         Long idTime = doc.getDocumentId() == null ? System.currentTimeMillis() : 
                                                     doc.getDocumentId();
         

Added: pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/services/java.nio.charset.spi.CharsetProvider
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/services/java.nio.charset.spi.CharsetProvider?rev=1598250&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/services/java.nio.charset.spi.CharsetProvider (added)
+++ pdfbox/branches/1.8/pdfbox/src/main/resources/META-INF/services/java.nio.charset.spi.CharsetProvider Thu May 29 10:35:52 2014
@@ -0,0 +1 @@
+org.apache.pdfbox.encoding.PDFBoxCharsetProvider
\ No newline at end of file

Modified: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSString.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSString.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSString.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSString.java Thu May 29 10:35:52 2014
@@ -20,12 +20,12 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 
-import org.apache.pdfbox.exceptions.COSVisitorException;
-import org.apache.pdfbox.pdfwriter.COSWriter;
-
 import junit.framework.Test;
 import junit.framework.TestSuite;
 
+import org.apache.pdfbox.exceptions.COSVisitorException;
+import org.apache.pdfbox.pdfwriter.COSWriter;
+
 /**
  * This will test all of the filters in the PDFBox system.
  *
@@ -34,9 +34,9 @@ import junit.framework.TestSuite;
  */
 public class TestCOSString extends TestCOSBase
 {
-    private final static String ESC_CHAR_STRING =
+    private static final String ESC_CHAR_STRING =
             "( test#some) escaped< \\chars>!~1239857 ";
-    private final static String ESC_CHAR_STRING_PDF_FORMAT =
+    private static final String ESC_CHAR_STRING_PDF_FORMAT =
             "\\( test#some\\) escaped< \\\\chars>!~1239857 ";
 
     /**
@@ -49,6 +49,7 @@ public class TestCOSString extends TestC
         return new TestSuite(TestCOSString.class);
     }
 
+    @Override
     public void setUp()
     {
         testCOSBase = new COSString("test cos string");
@@ -66,7 +67,7 @@ public class TestCOSString extends TestC
     }
 
     /**
-     * Tests the public static members within the class that are purely PDF format string objects 
+     * Tests the public static members within the class that are purely PDF format string objects
      * like open/closing strings, escape characters etc...
      */
     public void testStaticMembers()
@@ -85,7 +86,7 @@ public class TestCOSString extends TestC
 
     /**
      * Helper method for comparing a string to its PDF byte array.
-     * 
+     *
      * @param expected the String expected
      * @param member the byte array being tested
      */
@@ -105,7 +106,7 @@ public class TestCOSString extends TestC
 
     /**
      * Test setForceHexForm() and setForceLiteralForm() - tests these two methods do enforce the
-     * different String output forms within PDF. 
+     * different String output forms within PDF.
      */
     public void testSetForceHexLiteralForm()
     {
@@ -129,7 +130,7 @@ public class TestCOSString extends TestC
 
     /**
      * Helper method for testing writePDF().
-     * 
+     *
      * @param expected the String expected when writePDF() is invoked
      * @param testSubj the test subject
      */
@@ -200,8 +201,8 @@ public class TestCOSString extends TestC
         test1.setForceLiteralForm(true);
         assertEquals(hexForm, test1.getHexString());
         COSString escCS = new COSString(ESC_CHAR_STRING);
-        // Not sure whether the escaped characters should be escaped or not, presumably since 
-        // writePDF() gives you the proper formatted text, getHex() should ONLY convert to hex. 
+        // Not sure whether the escaped characters should be escaped or not, presumably since
+        // writePDF() gives you the proper formatted text, getHex() should ONLY convert to hex.
         assertEquals(createHex(ESC_CHAR_STRING), escCS.getHexString());
     }
 
@@ -230,6 +231,25 @@ public class TestCOSString extends TestC
 
             COSString escapedString = new COSString(ESC_CHAR_STRING);
             assertEquals(ESC_CHAR_STRING, escapedString.getString());
+
+            testStr = "Line1\nLine2\nLine3\n";
+            COSString lineFeedString = new COSString(testStr);
+            assertEquals(testStr, lineFeedString.getString());
+
+            //Same as previous but this time it is constructed incrementally (like in a dictionary)
+            lineFeedString = new COSString();
+            for (int i = 0; i < testStr.length(); i++)
+            {
+                lineFeedString.append(testStr.charAt(i));
+            }
+            assertEquals(testStr, lineFeedString.getString());
+
+            testStr = "Text\u2026"; //PDFBOX-1437
+            COSString pdfbox1437 = new COSString();
+            pdfbox1437.append(new byte[] {
+                    0x54, 0x65, 0x78, 0x74, (byte)(0x83 & 0xFF)
+            });
+            assertEquals(testStr, pdfbox1437.getString());
         }
         catch (IOException e)
         {
@@ -238,7 +258,7 @@ public class TestCOSString extends TestC
     }
 
     /**
-     * Test append(int) and append(byte[]) - test both code paths. 
+     * Test append(int) and append(byte[]) - test both code paths.
      */
     public void testAppend()
     {

Added: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java?rev=1598250&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java (added)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java Thu May 29 10:35:52 2014
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding;
+
+import java.io.UnsupportedEncodingException;
+
+import junit.framework.TestCase;
+
+/**
+ * This class tests {@link PDFDocEncodingCharset} and indirectly {@link SingleByteCharset}.
+ * @version $Revision$
+ */
+public class PDFDocEncodingCharsetTest extends TestCase
+{
+
+    /**
+     * Tests {@link PDFDocEncodingCharset} and indirectly {@link SingleByteCharset}.
+     * <p>
+     * A string of encodable characters is encoded, compared byte by byte against the expected
+     * values and decoded again. Afterwards a string containing unencodable characters is
+     * checked to come back with a question mark in place of each of them.
+     * @throws UnsupportedEncodingException if an encoding cannot be found
+     */
+    public void testEncoding() throws UnsupportedEncodingException
+    {
+        //TODO Use when switching to JavaSE-1.6
+        //Charset charset = PDFDocEncodingCharset.INSTANCE;
+
+        //Check basic round-trip
+        //All non-ASCII characters are written as Unicode escapes so the expected bytes
+        //do not depend on the encoding used to compile this source file.
+        String text = "Test \u20AC$\u00A3 ;-) Gr\u00FCezi\u2026";
+        byte[] encoded = text.getBytes(PDFDocEncodingCharset.NAME);
+        int[] expected = new int[] {
+                0x54, 0x65, 0x73, 0x74, 0x20, //Test
+                0xA0, 0x24, 0xA3, 0x20, //Currency (euro, dollar, pound)
+                0x3B, 0x2D, 0x29, 0x20, //Smiley
+                0x47, 0x72, 0xFC, 0x65, 0x7A, 0x69, //Hello in de_CH
+                0x83 //ellipsis
+        };
+        compareEncoded(encoded, expected);
+        String decoded = new String(encoded, PDFDocEncodingCharset.NAME);
+        assertEquals(text, decoded);
+
+        //Check replacement of characters which cannot be encoded
+        text = "Bad\u03C0\u2023char";
+        expected = new int[] {
+                0x42, 0x61, 0x64, 0x3F, 0x3F, 0x63, 0x68, 0x61, 0x72 //unencodable characters as '?'
+        };
+        encoded = text.getBytes(PDFDocEncodingCharset.NAME);
+        compareEncoded(encoded, expected);
+        decoded = new String(encoded, PDFDocEncodingCharset.NAME);
+        assertEquals("Bad??char", decoded);
+    }
+
+    /**
+     * Asserts that the encoded bytes match the expected values, first by length and then
+     * position by position.
+     * @param encoded the bytes produced by the encoder
+     * @param expected the expected byte values, given as ints so that values above 0x7F can
+     *          be written without a cast; only the low 8 bits of each value are compared
+     */
+    private void compareEncoded(byte[] encoded, int[] expected)
+    {
+        assertEquals(expected.length, encoded.length);
+        for (int i = 0; i < expected.length; i++)
+        {
+            assertEquals("Bad character at pos " + i, (byte)(expected[i] & 0xFF), encoded[i]);
+        }
+    }
+
+}

Propchange: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/PDFDocEncodingCharsetTest.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/package.html
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/package.html?rev=1598250&view=auto
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/package.html (added)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/package.html Thu May 29 10:35:52 2014
@@ -0,0 +1,25 @@
+<!--
+ ! Licensed to the Apache Software Foundation (ASF) under one or more
+ ! contributor license agreements.  See the NOTICE file distributed with
+ ! this work for additional information regarding copyright ownership.
+ ! The ASF licenses this file to You under the Apache License, Version 2.0
+ ! (the "License"); you may not use this file except in compliance with
+ ! the License.  You may obtain a copy of the License at
+ !
+ !      http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! Unless required by applicable law or agreed to in writing, software
+ ! distributed under the License is distributed on an "AS IS" BASIS,
+ ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ! See the License for the specific language governing permissions and
+ ! limitations under the License.
+ !-->
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+
+</head>
+<body>
+These classes will be used to test encoding classes.
+</body>
+</html>

Propchange: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/encoding/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: pdfbox/branches/1.8/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java?rev=1598250&r1=1598249&r2=1598250&view=diff
==============================================================================
--- pdfbox/branches/1.8/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java (original)
+++ pdfbox/branches/1.8/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java Thu May 29 10:35:52 2014
@@ -162,6 +162,7 @@ public class PreflightParser extends Non
         }
     }
 
+    @Override
     public void parse() throws IOException
     {
         parse(Format.PDF_A1B);
@@ -335,6 +336,7 @@ public class PreflightParser extends Non
      * Same method than the {@linkplain PDFParser#parseXrefTable(long)} with additional controls : - EOL mandatory after
      * the 'xref' keyword - Cross reference subsection header uses single white space as separator - and so on
      */
+    @Override
     protected boolean parseXrefTable(long startByteOffset) throws IOException
     {
         if (pdfSource.peek() != 'x')
@@ -447,6 +449,7 @@ public class PreflightParser extends Non
      * Wraps the {@link NonSequentialPDFParser#parseCOSStream} to check rules on 'stream' and 'endstream' keywords.
      * {@link #checkStreamKeyWord()} and {@link #checkEndstreamKeyWord()}
      */
+    @Override
     protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException
     {
         checkStreamKeyWord();
@@ -555,9 +558,22 @@ public class PreflightParser extends Non
     /**
      * Check that the hexa string contains only an even number of Hexadecimal characters. Once it is done, reset the
      * offset at the beginning of the string and call {@link BaseParser#parseCOSString()}
+     * @deprecated Not needed anymore. Use {@link #parseCOSString()} instead. PDFBOX-1437
      */
+    @Override
+    @Deprecated
     protected COSString parseCOSString(boolean isDictionary) throws IOException
     {
+        return parseCOSString();
+    }
+    
+    /**
+     * Check that the hexa string contains only an even number of Hexadecimal characters. Once it is done, reset the
+     * offset at the beginning of the string and call {@link BaseParser#parseCOSString()}
+     */
+    @Override
+    protected COSString parseCOSString() throws IOException
+    {
         // offset reminder
         long offset = pdfSource.getOffset();
         char nextChar = (char) pdfSource.read();
@@ -569,7 +585,7 @@ public class PreflightParser extends Non
                 nextChar = (char) pdfSource.read();
                 if (nextChar != '>')
                 {
-                    if (Character.digit((char) nextChar, 16) >= 0)
+                    if (Character.digit(nextChar, 16) >= 0)
                     {
                         count++;
                     }
@@ -591,7 +607,7 @@ public class PreflightParser extends Non
 
         // reset the offset to parse the COSString
         pdfSource.seek(offset);
-        COSString result = super.parseCOSString(isDictionary);
+        COSString result = super.parseCOSString();
 
         if (result.getString().length() > MAX_STRING_LENGTH)
         {
@@ -603,6 +619,7 @@ public class PreflightParser extends Non
     /**
      * Call {@link BaseParser#parseDirObject()} check limit range for Float, Integer and number of Dictionary entries.
      */
+    @Override
     protected COSBase parseDirObject() throws IOException
     {
         COSBase result = super.parseDirObject();
@@ -641,6 +658,7 @@ public class PreflightParser extends Non
         return result;
     }
 
+    @Override
     protected COSBase parseObjectDynamically(int objNr, int objGenNr, boolean requireExistingNotCompressedObj)
             throws IOException
     {
@@ -863,6 +881,7 @@ public class PreflightParser extends Non
         return pdfObject.getObject();
     }
 
+    @Override
     protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff)
     {
         int offset = super.lastIndexOf(pattern, buf, endOff);