You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/09/02 21:17:27 UTC

svn commit: r992066 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/cos/ main/java/org/apache/pdfbox/encoding/ main/java/org/apache/pdfbox/encoding/conversion/ main/java/org/apache/pdfbox/pdmodel/font/ test/java/org/apache/pdfbox/util/ test/...

Author: lehmi
Date: Thu Sep  2 19:17:26 2010
New Revision: 992066

URL: http://svn.apache.org/viewvc?rev=992066&view=rev
Log:
PDFBOX-568: improved text extraction of sample_fonts_solidconvertor.pdf and cweb.pdf from our test arena

Added:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
    pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt
    pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java Thu Sep  2 19:17:26 2010
@@ -835,8 +835,12 @@ public final class COSName extends COSBa
     /**
      * A common COSName value.
      */
+    public static final COSName SUPPLEMENT = new COSName( "Supplement" );
+    /**
+     * A common COSName value.
+     */
     public static final COSName SUBTYPE = new COSName( "Subtype" );
-
+    
     /**
      * "T"
      */

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java Thu Sep  2 19:17:26 2010
@@ -55,8 +55,7 @@ public class DictionaryEncoding extends 
         //for a nonsymbolic font, it is StandardEncoding, and for a symbolic font, it
         //is the font's built-in encoding."
 
-        //so the default base encoding is standardEncoding
-        Encoding baseEncoding = new StandardEncoding();
+        Encoding baseEncoding = null;
         COSName baseEncodingName = (COSName)encoding.getDictionaryObject( COSName.BASE_ENCODING );
 
         if( baseEncodingName != null )
@@ -64,6 +63,11 @@ public class DictionaryEncoding extends 
             EncodingManager manager = new EncodingManager();
             baseEncoding = manager.getEncoding( baseEncodingName );
         }
+        else
+        {
+            //the default base encoding is standardEncoding
+            baseEncoding = new StandardEncoding();
+        }
         nameToCode.putAll( baseEncoding.nameToCode );
         codeToName.putAll( baseEncoding.codeToName );
 

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java Thu Sep  2 19:17:26 2010
@@ -44,6 +44,7 @@ public abstract class Encoding implement
      */
     private static final Log log = LogFactory.getLog(Encoding.class);
 
+    public static final String NOTDEF = ".notdef";
     /**
      * This is a mapping from a character code to a character name.
      */
@@ -80,7 +81,7 @@ public abstract class Encoding implement
             }
         }
 
-        NAME_TO_CHARACTER.put( ".notdef", "" );
+        NAME_TO_CHARACTER.put( NOTDEF, "" );
         NAME_TO_CHARACTER.put( "fi", "fi" );
         NAME_TO_CHARACTER.put( "fl", "fl" );
         NAME_TO_CHARACTER.put( "ffi", "ffi" );
@@ -188,7 +189,7 @@ public abstract class Encoding implement
      * @param code The character code that matches the character.
      * @param name The name of the character.
      */
-    protected void addCharacterEncoding( int code, String name )
+    public void addCharacterEncoding( int code, String name )
     {
         codeToName.put( code, name );
         nameToCode.put( name, code );
@@ -273,7 +274,7 @@ public abstract class Encoding implement
      *
      * @return The printable character for the code.
      */
-    public static String getCharacter( String name )
+    public String getCharacter( String name )
     {
         String character = NAME_TO_CHARACTER.get( name );
         if( character == null )
@@ -314,33 +315,10 @@ public abstract class Encoding implement
                     character = name;
                 }
             }
-            // this encoding is used in pdfs generated with TeX/LateX 
-            else if (name.length() <= 4 && (name.startsWith("x") || name.startsWith("a")) ) 
+            else if (nameToCode.containsKey(name)) 
             {
-                try 
-                {
-                    int value = Integer.parseInt(name.substring(1), (name.startsWith("x") ? 16 : 10));
-                    // add some additional mapping for values < 32 and = 127
-                    if (value >=0 && value <= 9) 
-                    {
-                        value += 161;
-                    }
-                    else if (value >= 10 && value < 32) 
-                    {
-                        value += 163;
-                    }
-                    else if ( value == 127)
-                    {
-                        value = 196;
-                    }
-                    character = Character.toString((char)value);
-                    NAME_TO_CHARACTER.put(name, character);
-                }
-                catch(NumberFormatException exception) 
-                {
-                    log.warn( "Not a number in character name: " + name );
-                    character = name;
-                }
+                int code = nameToCode.get(name);
+                character = Character.toString((char)code);
             }
             else 
             {
@@ -349,5 +327,5 @@ public abstract class Encoding implement
         }
         return character;
     } 
-
+    
 }

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java?rev=992066&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java Thu Sep  2 19:17:26 2010
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding;
+
+import org.apache.pdfbox.cos.COSBase;
+
+/**
+ * This class represents an encoding which was read from a type1 font.
+ * 
+ */
+public class Type1Encoding extends Encoding
+{
+    public Type1Encoding(int size)
+    {
+        for (int i=1;i<size;i++)
+        {
+            addCharacterEncoding(i, NOTDEF);
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    public COSBase getCOSObject()
+    {
+        return null;
+    }
+
+}

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java Thu Sep  2 19:17:26 2010
@@ -64,6 +64,8 @@ public class CMapSubstitution 
         cmapSubstitutions.put( "UniJIS-UCS2-HW-H", "UniJIS-UCS2-H" );
         cmapSubstitutions.put( "Adobe-Japan1-4", "Adobe-Japan1-UCS2");
 
+        cmapSubstitutions.put( "Adobe-Identity-0", "Identity-H");
+        cmapSubstitutions.put( "Adobe-Identity-1", "Identity-H");
     }
 
     /**

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Thu Sep  2 19:17:26 2010
@@ -16,17 +16,18 @@
  */
 package org.apache.pdfbox.pdmodel.font;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.fontbox.afm.AFMParser;
 import org.apache.fontbox.afm.FontMetric;
 import org.apache.fontbox.cmap.CMapParser;
 import org.apache.fontbox.cmap.CMap;
-import org.apache.pdfbox.encoding.conversion.EncodingConversionManager;
-import org.apache.pdfbox.encoding.conversion.EncodingConverter;
 
 import org.apache.pdfbox.encoding.AFMEncoding;
 import org.apache.pdfbox.encoding.DictionaryEncoding;
 import org.apache.pdfbox.encoding.Encoding;
 import org.apache.pdfbox.encoding.EncodingManager;
+import org.apache.pdfbox.encoding.Type1Encoding;
 import org.apache.pdfbox.encoding.conversion.CMapSubstitution;
 
 import org.apache.pdfbox.cos.COSArray;
@@ -69,6 +70,11 @@ public abstract class PDFont implements 
 {
 
     /**
+     * Log instance.
+     */
+    private static final Log log = LogFactory.getLog(PDFont.class);
+
+    /**
      * The cos dictionary for this font.
      */
     protected COSDictionary font;
@@ -82,10 +88,10 @@ public abstract class PDFont implements 
      * This is only used if this is a font object and it has an encoding and it is
      * a type0 font with a cmap.
      */
-    private CMap cmap = null;
+    protected CMap cmap = null;
 
-    private static Map<COSName, CMap> cmapObjects =
-        Collections.synchronizedMap( new HashMap<COSName, CMap>() );
+    private static Map<String, CMap> cmapObjects =
+        Collections.synchronizedMap( new HashMap<String, CMap>() );
 
     /**
      * The static map of the default Adobe font metrics.
@@ -112,11 +118,13 @@ public abstract class PDFont implements 
         return metrics;
     }
 
+    private static String resourceRootCMAP = "org/apache/pdfbox/resources/cmap/";
+    private static String resourceRootAFM = "org/apache/pdfbox/resources/afm/";
+
     private static void addAdobeFontMetric(
             Map<String, FontMetric> metrics, String name ) {
         try {
-            String resource =
-                "org/apache/pdfbox/resources/afm/" + name + ".afm";
+            String resource = resourceRootAFM + name + ".afm";
             InputStream afmStream = ResourceLoader.loadResource( resource );
             if( afmStream != null )
             {
@@ -134,11 +142,6 @@ public abstract class PDFont implements 
     }
 
     /**
-     * This will be set if the font has a toUnicode stream.
-     */
-    private boolean hasToUnicode = false;
-
-    /**
      * This will clear AFM resources that are stored statically.
      * This is usually not a problem unless you want to reclaim
      * resources for a long running process.
@@ -171,6 +174,125 @@ public abstract class PDFont implements 
     public PDFont( COSDictionary fontDictionary )
     {
         font = fontDictionary;
+        determineEncoding();
+    }
+
+    private void determineEncoding()
+    {
+        String cmapName = null;
+        COSName encodingName = null;
+        COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE );
+        COSBase encoding = getEncodingObject(); 
+        if( toUnicode != null )
+        {
+            if ( toUnicode instanceof COSStream )
+            {
+                try {
+                    parseCmap(null, ((COSStream)toUnicode).getUnfilteredStream(), null);
+                }
+                catch(IOException exception) 
+                {
+                    log.error("Error: Could not load embedded CMAP" );
+                }
+            }
+            else if ( toUnicode instanceof COSName)
+            {
+                encodingName = (COSName)toUnicode;
+                cmap = cmapObjects.get( encodingName.getName() );
+                if (cmap == null) 
+                {
+                    cmapName = encodingName.getName();
+                }
+            }
+        }
+        if (encoding != null) 
+        {
+            if (encoding instanceof COSName) 
+            {
+                if (cmap == null)
+                {
+                    encodingName = (COSName)encoding;
+                    cmap = cmapObjects.get( encodingName.getName() );
+                    if (cmap == null) 
+                    {
+                        cmapName = encodingName.getName();
+                    }
+                }
+                if (cmap == null && cmapName != null)
+                {
+                    EncodingManager manager = getEncodingManager();
+                    try 
+                    {
+                        fontEncoding = manager.getEncoding( encodingName );
+                    }
+                    catch(IOException exception) 
+                    {
+                        log.debug("Debug: Could not find encoding for " + encodingName );
+                    }
+                }
+            }
+            else if (encoding instanceof COSDictionary) 
+            {
+                try 
+                {
+                    fontEncoding = new DictionaryEncoding((COSDictionary)encoding);
+                }
+                catch(IOException exception) 
+                {
+                    log.error("Error: Could not create the DictionaryEncoding" );
+                }
+            }
+            else if(encoding instanceof COSStream )
+            {
+                if (cmap == null)
+                {
+                    COSStream encodingStream = (COSStream)encoding;
+                    try 
+                    {
+                        parseCmap( null, encodingStream.getUnfilteredStream(), null );
+                    }
+                    catch(IOException exception) 
+                    {
+                        log.error("Error: Could not parse the embedded CMAP" );
+                    }
+                }
+            }
+        }
+        COSDictionary cidsysteminfo = (COSDictionary)font.getDictionaryObject(COSName.CIDSYSTEMINFO);
+        if (cidsysteminfo != null) 
+        {
+            String ordering = cidsysteminfo.getString(COSName.ORDERING);
+            String registry = cidsysteminfo.getString(COSName.REGISTRY);
+            int supplement = cidsysteminfo.getInt(COSName.SUPPLEMENT);
+            cmapName = registry + "-" + ordering+ "-" + supplement;
+            cmapName = CMapSubstitution.substituteCMap( cmapName );
+            cmap = cmapObjects.get( cmapName );
+        }
+        FontMetric metric = getAFM();
+        if( metric != null )
+        {
+            fontEncoding = new AFMEncoding( metric );
+        }
+        
+        if (cmap == null && cmapName != null) 
+        {
+            String resourceName = resourceRootCMAP + cmapName;
+            try {
+                parseCmap( resourceRootCMAP, ResourceLoader.loadResource( resourceName ), encodingName );
+                if( cmap == null && encodingName == null)
+                {
+                    log.error("Error: Could not parse predefined CMAP file for '" + cmapName + "'" );
+                }
+            }
+            catch(IOException exception) 
+            {
+                log.error("Error: Could not find predefined CMAP file for '" + cmapName + "'" );
+            }
+        }
+//        if (fontEncoding == null)
+//        {
+            getEncodingFromFont();
+//        }
     }
 
     /**
@@ -316,9 +438,8 @@ public abstract class PDFont implements 
      *
      * @return The afm object from the name.
      *
-     * @throws IOException If there is an error getting the AFM object.
      */
-    protected FontMetric getAFM() throws IOException
+    protected FontMetric getAFM()
     {
         if(afm==null){
             COSBase baseFont = font.getDictionaryObject( COSName.BASE_FONT );
@@ -379,136 +500,27 @@ public abstract class PDFont implements 
     public String encode( byte[] c, int offset, int length ) throws IOException
     {
         String retval = null;
-        if( isTypeFont() )
+        if( cmap != null )
         {
-            if( cmap == null )
+            if (length == 1 && cmap.hasOneByteMappings()) 
             {
-                COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE );
-                if( toUnicode instanceof COSStream )
-                {
-                    hasToUnicode = true;
-                    parseCmap( null, ((COSStream)toUnicode).getUnfilteredStream(), null );
-                }
-                else
-                {
-                    COSBase encoding = getEncodingObject();
-                    if( encoding instanceof COSStream )
-                    {
-                        COSStream encodingStream = (COSStream)encoding;
-                        parseCmap( null, encodingStream.getUnfilteredStream(), null );
-                    }
-                    else if( isType0Font() && encoding instanceof COSName )
-                    {
-                        COSName encodingName = (COSName)encoding;
-                        cmap = cmapObjects.get( encodingName );
-                        if( cmap == null )
-                        {
-                            String cmapName = encodingName.getName();
-                            if (encodingName.getName().equals( COSName.IDENTITY_H.getName() )) 
-                            {
-                                COSArray descendantFontArray =
-                                    (COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS );
-                                if (descendantFontArray != null) 
-                                {
-                                    COSDictionary descendantFontDictionary = 
-                                        (COSDictionary)descendantFontArray.getObject( 0 );
-                                    PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary );
-                                    COSDictionary cidsysteminfo = 
-                                        (COSDictionary)descendentFont.font.getDictionaryObject(COSName.CIDSYSTEMINFO);
-                                    if (cidsysteminfo != null) 
-                                    {
-                                        String ordering = cidsysteminfo.getString(COSName.ORDERING);
-                                        String registry = cidsysteminfo.getString(COSName.REGISTRY);
-                                        cmapName = registry + "-" + ordering+"-UCS2";
-                                    }
-                                }
-                            } 
-                            else 
-                            {
-                                cmapName = CMapSubstitution.substituteCMap( cmapName );
-                            }
-
-                            String resourceRoot = "org/apache/pdfbox/resources/cmap/";
-                            String resourceName = resourceRoot + cmapName;
-                            parseCmap( resourceRoot, ResourceLoader.loadResource( resourceName ), encodingName );
-                            if( cmap == null && !encodingName.getName().equals( COSName.IDENTITY_H.getName() ) )
-                            {
-                                throw new IOException( "Error: Could not find predefined " +
-                                "CMAP file for '" + encodingName.getName() + "'" );
-                            }
-                        }
-                    }
-                    else if( encoding instanceof COSName ||
-                             encoding instanceof COSDictionary )
-                    {
-                        Encoding currentFontEncoding = getEncoding();
-                        if( currentFontEncoding != null )
-                        {
-                            retval = currentFontEncoding.getCharacter( getCodeFromArray( c, offset, length ) );
-                        }
-                    }
-                    else
-                    {
-                        COSDictionary fontDescriptor =
-                            (COSDictionary)font.getDictionaryObject( COSName.FONT_DESC );
-                        if( isTrueTypeFont() && fontDescriptor != null &&
-                            (fontDescriptor.getDictionaryObject( COSName.FONT_FILE )!= null ||
-                             fontDescriptor.getDictionaryObject( COSName.FONT_FILE2 ) != null ||
-                             fontDescriptor.getDictionaryObject( COSName.FONT_FILE3 ) != null ) )
-                        {
-                            //If we are using an embedded font then there is not much we can do besides
-                            //return the same character codes.
-                            //retval = new String( c,offset, length );
-                            retval = getStringFromArray( c, offset, length );
-                        }
-                        else
-                        {
-                            //this case will be handled below after checking the cmap
-                        }
-                    }
-                }
-
-
+                retval = cmap.lookup( c, offset, length );
             }
-        }
-        if( retval == null && cmap != null )
-        {
-            retval = cmap.lookup( c, offset, length );
-        }
-        
-        COSBase encodingCOS = getEncodingObject();
-        // The converter isn't needed if an unicode mapping is already given by the font dictionary  
-        if ( !hasToUnicode && encodingCOS instanceof COSName ) 
-        {
-            EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encodingCOS).getName());
-            if ( converter != null ) 
+            else if (length == 2 && cmap.hasTwoByteMappings())
             {
-                if ( retval != null )
-                {
-                    retval = converter.convertString(retval);
-                }
-                else
-                {
-                    retval = converter.convertBytes(c, offset, length, cmap);
-                }
-                return retval;
+                retval = cmap.lookup( c, offset, length );
             }
         }
         
-        //if we havn't found a value yet and
-        //we are still on the first byte and
-        //there is no cmap or the cmap does not have 2 byte mappings then try to encode
-        //using fallback methods.
-        if( retval == null &&
-            length == 1 &&
-            (cmap == null || !cmap.hasTwoByteMappings()))
+        // there is no cmap but probably an encoding with a suitable mapping
+        if( retval == null && length == 1)
         {
             Encoding encoding = getEncoding();
             if( encoding != null )
             {
                 retval = encoding.getCharacter( getCodeFromArray( c, offset, length ) );
             }
-            if( retval == null )
+            if( retval == null && cmap == null)
             {
                 retval = getStringFromArray( c, offset, length );
             }
@@ -548,16 +560,20 @@ public abstract class PDFont implements 
         return retval;
     }
 
-    private void parseCmap( String cmapRoot, InputStream cmapStream, COSName encodingName ) throws IOException
+    private void parseCmap( String cmapRoot, InputStream cmapStream, COSName encodingName )
     {
         if( cmapStream != null )
         {
             CMapParser parser = new CMapParser();
-            cmap = parser.parse( cmapRoot, cmapStream );
-            if( encodingName != null )
+            try 
             {
-                cmapObjects.put( encodingName, cmap );
+                cmap = parser.parse( cmapRoot, cmapStream );
+                if( encodingName != null )
+                {
+                    cmapObjects.put( encodingName.getName(), cmap );
+                }
             }
+            catch (IOException exception) {}
         }
     }
 
@@ -583,61 +599,6 @@ public abstract class PDFont implements 
      */
     public Encoding getEncoding() throws IOException
     {
-        if( fontEncoding == null )
-        {
-            EncodingManager manager = getEncodingManager();
-            COSBase encoding = getEncodingObject(); //font.getDictionaryObject( COSName.ENCODING );
-            if( encoding == null )
-            {
-                FontMetric metric = getAFM();
-                if( metric != null )
-                {
-                    fontEncoding = new AFMEncoding( metric );
-                }
-                if( fontEncoding == null )
-                {
-                    fontEncoding = manager.getStandardEncoding();
-                }
-            }
-            /**
-             * Si la clé /Encoding existe dans le dictionnaire fonte il y a deux possibilités :
-             * 1er cas : elle est associé à une reference contenant un dictionnaire de type encoding.
-             * Ce dictionnaire PDF est représenté par un DictionaryEncoding.
-             * If the /Encoding Key does exist in the font dictionary, there are two cases :
-             * case one : The value associated with /Encoding is a reference to a dictionary.
-             * This dictionary is represented by an instance of DictionaryEncoding class
-             */
-            else if( encoding instanceof COSDictionary )
-            {
-                COSDictionary encodingDic = (COSDictionary)encoding;
-                //Let's see if the encoding dictionary has a base encoding
-                //If it does not then we will attempt to get it from the font
-                //file
-                COSName baseEncodingName = (COSName) encodingDic.getDictionaryObject(
-                    COSName.BASE_ENCODING);
-                //on ajoute une entrée /BaseEncoding dans /Encoding uniquement si elle en est absente
-                //if not find in Encoding dictinary target, we try to find it from else where
-                if( baseEncodingName == null)
-                {
-                    COSName fontEncodingFromFile = getEncodingFromFont();
-                    encodingDic.setItem(
-                        COSName.BASE_ENCODING,
-                        fontEncodingFromFile );
-                }
-                fontEncoding = new DictionaryEncoding( encodingDic );
-            }
-            else if( encoding instanceof COSName )
-            {
-                if( !encoding.equals( COSName.IDENTITY_H ) )
-                {
-                    fontEncoding = manager.getEncoding( (COSName)encoding );
-                }
-            }
-            else
-            {
-                throw new IOException( "Unexpected encoding type:" + encoding.getClass().getName() );
-            }
-        }
         return fontEncoding;
     }
 
@@ -653,7 +614,7 @@ public abstract class PDFont implements 
 
     // Memorized values to avoid repeated dictionary lookups
     private String subtype = null;
-    private boolean type0Font;
+    private boolean type1Font;
     private boolean trueTypeFont;
     private boolean typeFont;
 
@@ -666,16 +627,16 @@ public abstract class PDFont implements 
     {
         if (subtype == null) {
             subtype = font.getNameAsString( COSName.SUBTYPE );
-            type0Font = "Type0".equals(subtype);
+            type1Font = "Type1".equals(subtype);
             trueTypeFont = "TrueType".equals(subtype);
-            typeFont = type0Font || "Type1".equals(subtype) || trueTypeFont;
+            typeFont = type1Font || "Type0".equals(subtype) || trueTypeFont;
         }
         return subtype;
     }
 
-    private boolean isType0Font() {
+    private boolean isType1Font() {
         getSubType();
-        return type0Font;
+        return type1Font;
     }
 
     private boolean isTrueTypeFont() {
@@ -799,80 +760,85 @@ public abstract class PDFont implements 
     }
 
     /**
-     * Try to get the encoding for the font and add it to the target
-     * the target must be an an Encoding Dictionary.
+     * Tries to get the encoding for the type1 font.
      *
-     * added by Christophe Huault : DGBS Strasbourg huault@free.fr october 2004
-     *
-     * @return The encoding from the font.
-     *
-     * @throws IOException If there is an error reading the file.
      */
-    private COSName getEncodingFromFont() throws IOException
-    {
-        //This whole section of code needs to be replaced with an actual
-        //type1 font parser!!
-
-
-        COSName retvalue = null;
-        //recuperer le programme de fonte dans son stream qui doit se trouver
-        //dans le flux référencé par à la clé FileFont lui même situé dans
-        //le dictionnaire associé à /FontDescriptor du dictionnaire de type /Font courrant
-        //get the font program in the stream which should be located in
-         //the /FileFont Stream object himself in the /FontDescriptior of the current
-        //font dictionary
-        COSDictionary fontDescriptor = (COSDictionary) font.getDictionaryObject(
-            COSName.FONT_DESC);
-        if( fontDescriptor != null )
-        {
-            COSStream fontFile = (COSStream) fontDescriptor.getDictionaryObject(
-                COSName.FONT_FILE);
-            if( fontFile != null )
-            {
-                BufferedReader in =
-                        new BufferedReader(new InputStreamReader(fontFile.getUnfilteredStream()));
-                /**
-                 * this section parse the FileProgram stream searching for a /Encoding entry
-                 * the research stop if the entry "currentdict end" is reach or after 100 lignes
-                 */
-                StringTokenizer st = null;
-                boolean found = false;
-                String line = "";
-                String key = null;
-                for( int i = 0; null!=( line = in.readLine() ) &&
-                                i < 40  &&
-                                !line.equals("currentdict end")
-                                && !found; i++)
+    private void getEncodingFromFont()
+    {
+        // This whole section of code needs to be replaced with an actual type1 font parser!!
+        // Get the font program from the embedded type font.
+        if (isType1Font()) {
+            COSDictionary fontDescriptor = (COSDictionary) font.getDictionaryObject(
+                COSName.FONT_DESC);
+            if( fontDescriptor != null )
+            {
+                COSStream fontFile = (COSStream) fontDescriptor.getDictionaryObject(
+                    COSName.FONT_FILE);
+                if( fontFile != null )
                 {
-                    st = new StringTokenizer(line);
-                    if( st.hasMoreTokens() )
+                    try 
                     {
-                        key = st.nextToken();
-                        if(key.equals("/Encoding") && st.hasMoreTokens() )
+                        BufferedReader in =
+                                new BufferedReader(new InputStreamReader(fontFile.getUnfilteredStream()));
+                        
+                        // this section parses the font program stream searching for a /Encoding entry
+                        // if it contains an array of values a Type1Encoding will be returned
+                        // if it encoding contains an encoding name the corresponding Encoding will be returned
+                        String line = "";
+                        Type1Encoding encoding = null;
+                        while( (line = in.readLine()) != null)
                         {
-                            COSName value = COSName.getPDFName( st.nextToken() );
-                            found = true;
-                            if( value.equals( COSName.MAC_ROMAN_ENCODING ) ||
-                                value.equals( COSName.PDF_DOC_ENCODING ) ||
-                                value.equals( COSName.STANDARD_ENCODING ) ||
-                                value.equals( COSName.WIN_ANSI_ENCODING ) )
+                            if (line.startsWith("currentdict end")) {
+                                if (encoding != null)
+                                    fontEncoding = encoding;
+                                break;
+                            }
+                            if (line.startsWith("/Encoding")) 
                             {
-                                //value is expected to be one of the encodings
-                                //ie. StandardEncoding,WinAnsiEncoding,MacRomanEncoding,PDFDocEncoding
-                                retvalue = value;
+                                if(line.endsWith("array")) 
+                                {
+                                    StringTokenizer st = new StringTokenizer(line);
+                                    // ignore the first token
+                                    st.nextElement();
+                                    int arraySize = Integer.parseInt(st.nextToken());
+                                    encoding = new Type1Encoding(arraySize);
+                                }
+                                // if there is already an encoding, we don't need to
+                                // assign another one
+                                else if (fontEncoding == null)
+                                {
+                                    StringTokenizer st = new StringTokenizer(line);
+                                    // ignore the first token
+                                    st.nextElement();
+                                    String type1Encoding = st.nextToken();
+                                    fontEncoding = getEncodingManager().getEncoding(COSName.getPDFName(type1Encoding));
+                                    break;
+                                }
+                            }
+                            else if (line.startsWith("dup")) {
+                                StringTokenizer st = new StringTokenizer(line);
+                                // ignore the first token
+                                st.nextElement();
+                                int index = Integer.parseInt(st.nextToken());
+                                String name = st.nextToken();
+                                encoding.addCharacterEncoding(index, name.replace("/", ""));
                             }
                         }
+                        in.close();
+                    }
+                    catch(IOException exception) 
+                    {
+                        log.error("Error: Could not extract the encoding from the embedded type1 font.");
                     }
                 }
             }
         }
-        return retvalue;
     }
 
     /**
-     * This will get the fonts bouding box.
+     * This will get the fonts bounding box.
      *
-     * @return The fonts bouding box.
+     * @return The fonts bounding box.
      *
      * @throws IOException If there is an error getting the bounding box.
      */

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Thu Sep  2 19:17:26 2010
@@ -307,14 +307,7 @@ public class TestTextStripper extends Te
                 }
                 if (!stringsEqual(expectedLine, actualLine))
                 {
-                    // PDFBOX-568: testextract failure on Linux and Mac OS X
-                    // Don't flag a test failure that we already know about.
-                    // TODO: Remove this check once PDFBOX-568 is fixed.
-                    if (!"sample_fonts_solidconvertor.pdf".equals(inFile.getName())) 
-                    {
-                        this.bFail = true;
-                    }
-
+                    this.bFail = true;
                     log.error("FAILURE: Line mismatch for file " + inFile.getName() +
                             " ( sort = "+bSort+")" +
                             " at expected line: " + expectedReader.getLineNumber() +

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
Binary files - no diff available.

Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt?rev=992066&r1=992065&r2=992066&view=diff
==============================================================================
Binary files - no diff available.