You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@pdfbox.apache.org by kl...@lhsystems.com on 2014/10/08 17:51:10 UTC

Contribution to pdfbox

Dear  developer-Team,
I'm evaluating PDFbox 1.8.6 for the creation of a pdf file for an editor.

At one point I got stuck because characters from the PostScript font were not rendered correctly. The first characters I noticed were missing were the German umlauts. From studying the code of the relevant Java class "PDType1AfmPfbFont" I found out that the encoding from the font file was not transferred to the PDF file.

I made a change to the class so that the encoding from the AFM file is transferred to the PDF file. I rebuilt the project and now I get the correct characters. I wonder whether you would like to include the changes in your project.


The changed class is below. I removed some special handling for the German umlauts and added an encoding dictionary (red lines).

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdmodel.font;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.fontbox.afm.AFMParser;
import org.apache.fontbox.afm.CharMetric;
import org.apache.fontbox.afm.FontMetric;
import org.apache.fontbox.pfb.PfbParser;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.encoding.AFMEncoding;
import org.apache.pdfbox.encoding.DictionaryEncoding;
import org.apache.pdfbox.encoding.Encoding;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;

/**
 * A Type 1 font that is created from an AFM (font metrics) file and a PFB (font program) file
 * and embedded into a PDF document.
 *
 * <p>In addition to the in-memory {@link Encoding} used by this class, the code-to-glyph-name
 * mapping read from the AFM file is written into the font dictionary as an
 * <code>/Encoding</code> dictionary with a <code>/Differences</code> array.</p>
 *
 * @author <a href="mailto:m.g.n@gmx.de">Michael Niedermair</a>
 * @version $Revision: 1.5 $
 */
public class PDType1AfmPfbFont extends PDType1Font
{
    /**
     * The buffer size used when reading the font files.
     */
    private static final int BUFFERSIZE = 0xffff;

    /**
     * Number of character codes handled by the encoding and the widths table.
     */
    private static final int CODE_COUNT = 256;

    /**
     * Width assigned to every code for which the AFM file provides no positive width.
     */
    private static final int DEFAULT_WIDTH = 250;

    /**
     * Code of "germandbls" in WinAnsiEncoding (octal 0337 = decimal 223).
     */
    private static final int WINANSI_GERMANDBLS = 0337;

    /**
     * The font metric.
     */
    private FontMetric metric;

    /**
     * The font encoding dictionary that is stored in the font dictionary, or <code>null</code>
     * if the AFM encoding did not provide any glyph name.
     */
    protected COSDictionary fontEncodingDic;

    /**
     * Create a new object from an AFM file; the PFB file is expected next to it, with the same
     * name but the extension ".pfb".
     *
     * <p>Both files are opened and closed by this constructor.</p>
     *
     * @param doc The PDF document that will hold the embedded font.
     * @param afmname The font filename.
     * @throws IOException If there is an error loading the data.
     */
    public PDType1AfmPfbFont(final PDDocument doc, final String afmname) throws IOException
    {
        super();
        InputStream afmin = new BufferedInputStream(new FileInputStream(afmname), BUFFERSIZE);
        try
        {
            InputStream pfbin = new BufferedInputStream(new FileInputStream(toPfbName(afmname)), BUFFERSIZE);
            try
            {
                load(doc, afmin, pfbin);
            }
            finally
            {
                // load() closes this stream as well; a second close has no effect
                pfbin.close();
            }
        }
        finally
        {
            // these streams are owned by this constructor, so they must not leak
            afmin.close();
        }
    }

    /**
     * Create a new object.
     *
     * <p>The pfb stream is closed by this constructor; the afm stream is left open and remains
     * the responsibility of the caller.</p>
     *
     * @param doc The PDF document that will hold the embedded font.
     * @param afm The afm input.
     * @param pfb The pfb input.
     * @throws IOException If there is an error loading the data.
     */
    public PDType1AfmPfbFont(final PDDocument doc, final InputStream afm, final InputStream pfb) throws IOException
    {
        super();
        load(doc, afm, pfb);
    }

    /**
     * Derives the name of the PFB file from the name of the AFM file.
     *
     * <p>Only a trailing ".afm" or ".AFM" is removed. The name is treated as plain text, not as
     * a regular expression, so other parts of the path are never touched.</p>
     *
     * @param afmname The name of the AFM file.
     * @return The name of the matching PFB file.
     */
    private static String toPfbName(final String afmname)
    {
        String base = afmname;
        if (afmname.endsWith(".afm") || afmname.endsWith(".AFM"))
        {
            base = afmname.substring(0, afmname.length() - ".afm".length());
        }
        return base + ".pfb";
    }

    /**
     * This will load a afm and pfb to be embedding into a document.
     *
     * @param doc The PDF document that will hold the embedded font.
     * @param afm The afm input.
     * @param pfb The pfb input; it is closed by this method.
     * @throws IOException If there is an error loading the data.
     */
    private void load(final PDDocument doc, final InputStream afm, final InputStream pfb) throws IOException
    {
        fontEncodingDic = null;
        PDFontDescriptorDictionary fd = new PDFontDescriptorDictionary();
        setFontDescriptor(fd);

        // read the pfb
        fd.setFontFile(createFontStream(doc, pfb));

        // read the afm
        AFMParser parser = new AFMParser(afm);
        parser.parse();
        metric = parser.getResult();
        setFontEncoding(afmToDictionary(new AFMEncoding(metric)));

        // set the values
        setBaseFont(metric.getFontName());
        fillFontDescriptor(fd);

        // the widths table always covers the complete code range
        setFirstChar(0);
        setLastChar(CODE_COUNT - 1);
        setWidths(buildWidths());
    }

    /**
     * Parses the PFB data and wraps it into a compressed stream that carries the segment
     * lengths ("Length", "Length1", "Length2", ...).
     *
     * @param doc The PDF document that will hold the embedded font.
     * @param pfb The pfb input; it is closed by this method, also if parsing fails.
     * @return The stream holding the font program.
     * @throws IOException If there is an error reading the pfb data.
     */
    private PDStream createFontStream(final PDDocument doc, final InputStream pfb) throws IOException
    {
        PfbParser pfbparser;
        try
        {
            pfbparser = new PfbParser(pfb);
        }
        finally
        {
            // the input is not needed any more once the parser has been created
            pfb.close();
        }

        PDStream fontStream = new PDStream(doc, pfbparser.getInputStream(), false);
        fontStream.getStream().setInt("Length", pfbparser.size());
        int[] lengths = pfbparser.getLengths();
        for (int i = 0; i < lengths.length; i++)
        {
            fontStream.getStream().setInt("Length" + (i + 1), lengths[i]);
        }
        fontStream.addCompression();
        return fontStream;
    }

    /**
     * Copies the descriptive values of the font metric into the font descriptor.
     *
     * @param fd The font descriptor to fill.
     */
    private void fillFontDescriptor(final PDFontDescriptorDictionary fd)
    {
        fd.setFontName(metric.getFontName());
        fd.setFontFamily(metric.getFamilyName());
        fd.setNonSymbolic(true);
        fd.setFontBoundingBox(new PDRectangle(metric.getFontBBox()));
        fd.setItalicAngle(metric.getItalicAngle());
        fd.setAscent(metric.getAscender());
        fd.setDescent(metric.getDescender());
        fd.setCapHeight(metric.getCapHeight());
        fd.setXHeight(metric.getXHeight());
        fd.setAverageWidth(metric.getAverageCharacterWidth());
        fd.setCharacterSet(metric.getCharacterSet());
    }

    /**
     * Builds the widths table for all character codes from the character metrics.
     *
     * <p>Every code starts with the default width. Metrics with a code outside of
     * 1..255 or without a positive width are ignored.</p>
     *
     * @return The widths, one entry per character code.
     */
    private List<Float> buildWidths()
    {
        List<Float> widths = new ArrayList<Float>(CODE_COUNT);
        for (int i = 0; i < CODE_COUNT; i++)
        {
            widths.add((float) DEFAULT_WIDTH);
        }
        for (CharMetric m : metric.getCharMetrics())
        {
            int code = m.getCharacterCode();
            // codes beyond the table would otherwise raise an IndexOutOfBoundsException
            if (code <= 0 || code >= CODE_COUNT || m.getWx() <= 0)
            {
                continue;
            }
            float width = Math.round(m.getWx());
            widths.set(code, width);
            // germandbls has 2 character codes !! Don't ask me why .....
            // StandardEncoding = 0373 = 251
            // WinANSIEncoding = 0337 = 223
            if ("germandbls".equals(m.getName()) && code != WINANSI_GERMANDBLS)
            {
                widths.set(WINANSI_GERMANDBLS, width);
            }
        }
        return widths;
    }

    /**
     * This will generate a Encoding from the AFM-Encoding, because the AFM-Encoding isn't
     * exported to the pdf and consequently the StandardEncoding is used so that any special
     * character is missing. The code was copied from the pdfbox-forum posted by V0JT4, see also
     * https://sourceforge.net/forum/message.php?msg_id=4705274
     *
     * <p>As a side effect, all codes that have a glyph name are collected as "code name" pairs
     * in a separate encoding dictionary, which is stored in {@link #fontEncodingDic} and set as
     * the <code>/Encoding</code> entry of the font dictionary.</p>
     *
     * @param encoding The encoding read from the AFM file.
     * @return The encoding to be used by this font.
     * @throws java.io.IOException If a glyph name cannot be read from the encoding.
     */
    private DictionaryEncoding afmToDictionary(AFMEncoding encoding) throws java.io.IOException
    {
        // "array" lists a name for every code starting at 0 (entries without a name stay empty),
        // "differEncoding" holds explicit code/name pairs for the named codes only
        COSArray array = new COSArray();
        COSArray differEncoding = new COSArray();
        array.add(COSInteger.ZERO);
        for (int i = 0; i < CODE_COUNT; i++)
        {
            String name = encoding.getName(i);
            COSName cosName = COSName.getPDFName(name);
            if (name != null)
            {
                differEncoding.add(COSInteger.get(i));
                differEncoding.add(cosName);
            }
            array.add(cosName);
        }

        COSDictionary dictionary = new COSDictionary();
        dictionary.setItem(COSName.NAME, COSName.ENCODING);
        dictionary.setItem(COSName.DIFFERENCES, array);
        dictionary.setItem(COSName.BASE_ENCODING, COSName.STANDARD_ENCODING);

        if (differEncoding.size() > 0)
        {
            fontEncodingDic = new COSDictionary();
            fontEncodingDic.setItem(COSName.TYPE, COSName.ENCODING);
            fontEncodingDic.setItem(COSName.DIFFERENCES, differEncoding);

            COSDictionary afont = (COSDictionary) getCOSObject();
            afont.setItem(COSName.ENCODING, fontEncodingDic);
        }

        return new DictionaryEncoding(dictionary);
    }

    /**
     * The encoding dictionary that was stored in the font dictionary.
     *
     * @return The dictionary of encoding, or <code>null</code> if the AFM encoding did not
     * provide any glyph name.
     */
    public COSDictionary getFontEncodingDic()
    {
        return fontEncodingDic;
    }

    /**
     * Releases the font metric in addition to whatever the superclass clears.
     */
    @Override
    public void clear()
    {
        super.clear();
        metric = null;
    }
}



Best regards
Klaus Graaf

Lufthansa Systems
Dr. Klaus Graaf
Schützenwall 1
D-22844 Norderstedt

Büro:       +49-40-5070-6849
Fax:        +49-40-5070-7880
Handy:    +49-151-58920261
Internet:  http://www.lhsystems.com<http://www.lhsystems.com/>
Email:     mailto:Klaus.Graaf@lhsystems.com


 
Sitz der Gesellschaft / Corporate Headquarters: Lufthansa Systems AS GmbH, Norderstedt, Registereintragung / Registration: Amtsgericht Norderstedt 3688NO
Geschaeftsfuehrung / Management Board: Bernd Appel

Re: Contribution to pdfbox

Posted by John Hewson <jo...@jahewson.com>.

Hi Klaus

Thanks, can you open an issue on JIRA at https://issues.apache.org/jira/browse/pdfbox
and if possible attach the file as an SVN patch (see https://ariejan.net/2007/07/03/how-to-create-and-apply-a-patch-with-subversion/) via More > Attach Files.

We might want to add some similar code to the trunk as well as 1.8, but I’ll figure that out later.

Thanks

-- John

On 8 Oct 2014, at 08:51, klaus.graaf@lhsystems.com wrote:

> Dear  developer-Team,
> I'm evaluating PDFbox 1.8.6 for the creation of a pdf file for an editor.
> 
> At one point I got stuck, due to the missing of the correct characters from the PostScript font. The first characters I was missing were the German Umlaute.  From studying the coding of the relevant Java class "PDType1AfmPfbFont" I found out, that the encoding  from the font file, was not transfer to the pdf-file.
> 
> I made a change to the class, so the encoding from the afm-file will be transferred to the pdf-file. I rebuild the project and now I get the correct characters. I wonder, if you would like to put the changes to your project.
> 
> 
> The changed class is here. I erased some special handling for the German Umlaut and added an encoding dictionary (red lines).
> 
> /*
> * Licensed to the Apache Software Foundation (ASF) under one or more
> * contributor license agreements.  See the NOTICE file distributed with
> * this work for additional information regarding copyright ownership.
> * The ASF licenses this file to You under the Apache License, Version 2.0
> * (the "License"); you may not use this file except in compliance with
> * the License.  You may obtain a copy of the License at
> *
> *      http://www.apache.org/licenses/LICENSE-2.0
> *
> * Unless required by applicable law or agreed to in writing, software
> * distributed under the License is distributed on an "AS IS" BASIS,
> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> * See the License for the specific language governing permissions and
> * limitations under the License.
> */
> package org.apache.pdfbox.pdmodel.font;
> 
> import java.io.BufferedInputStream;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.util.ArrayList;
> import java.util.Iterator;
> import java.util.List;
> 
> import org.apache.fontbox.afm.AFMParser;
> import org.apache.fontbox.afm.CharMetric;
> import org.apache.fontbox.afm.FontMetric;
> import org.apache.fontbox.pfb.PfbParser;
> import org.apache.pdfbox.cos.COSArray;
> import org.apache.pdfbox.cos.COSDictionary;
> import org.apache.pdfbox.cos.COSInteger;
> import org.apache.pdfbox.cos.COSName;
> import org.apache.pdfbox.cos.COSNumber;
> import org.apache.pdfbox.encoding.AFMEncoding;
> import org.apache.pdfbox.encoding.DictionaryEncoding;
> import org.apache.pdfbox.encoding.Encoding;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.pdmodel.common.PDRectangle;
> import org.apache.pdfbox.pdmodel.common.PDStream;
> 
> /**
> * This is implementation of the Type1 Font with a afm and a pfb file.
> *
> * @author <a href="mailto:m.g.n@gmx.de">Michael Niedermair</a>
> * @version $Revision: 1.5 $
> */
> public class PDType1AfmPfbFont extends PDType1Font
> {
>    /**
>     * the buffersize.
>     */
>    private static final int BUFFERSIZE = 0xffff;
> 
>    /**
>     * The font metric.
>     */
>    private FontMetric metric;
> 
>    /**
>     * The font encoding dictionary.
>     */
>    protected COSDictionary fontEncodingDic;
> 
>    /**
>     * Create a new object.
>     *
>     * @param doc The PDF document that will hold the embedded font.
>     * @param afmname The font filename.
>     * @throws IOException If there is an error loading the data.
>     */
>    public PDType1AfmPfbFont(final PDDocument doc, final String afmname) throws IOException
>    {
> 
>        super();
>        InputStream afmin = new BufferedInputStream(new FileInputStream(afmname), BUFFERSIZE);
>        String pfbname = afmname.replaceAll(".AFM", "").replaceAll(".afm", "") + ".pfb";
>        InputStream pfbin = new BufferedInputStream(new FileInputStream(pfbname), BUFFERSIZE);
>        load(doc, afmin, pfbin);
>    }
> 
>    /**
>     * Create a new object.
>     *
>     * @param doc The PDF document that will hold the embedded font.
>     * @param afm The afm input.
>     * @param pfb The pfb input.
>     * @throws IOException If there is an error loading the data.
>     */
>    public PDType1AfmPfbFont(final PDDocument doc, final InputStream afm, final InputStream pfb) throws IOException
>    {
>        super();
>        load(doc, afm, pfb);
>    }
> 
>    /**
>     * This will load a afm and pfb to be embedding into a document.
>     *
>     * @param doc The PDF document that will hold the embedded font.
>     * @param afm The afm input.
>     * @param pfb The pfb input.
>     * @throws IOException If there is an error loading the data.
>     */
>    private void load(final PDDocument doc, final InputStream afm, final InputStream pfb) throws IOException
>    {
>        fontEncodingDic = null;
>        PDFontDescriptorDictionary fd = new PDFontDescriptorDictionary();
>        setFontDescriptor(fd);
> 
>        // read the pfb
>        PfbParser pfbparser = new PfbParser(pfb);
>        pfb.close();
> 
>        PDStream fontStream = new PDStream(doc, pfbparser.getInputStream(), false);
>        fontStream.getStream().setInt("Length", pfbparser.size());
>        for (int i = 0; i < pfbparser.getLengths().length; i++)
>        {
>            fontStream.getStream().setInt("Length" + (i + 1), pfbparser.getLengths()[i]);
>        }
>        fontStream.addCompression();
>        fd.setFontFile(fontStream);
> 
>        // read the afm
>        AFMParser parser = new AFMParser(afm);
>        parser.parse();
>        metric = parser.getResult();
>        setFontEncoding(afmToDictionary(new AFMEncoding(metric)));
> 
>        // set the values
>        setBaseFont(metric.getFontName());
>        fd.setFontName(metric.getFontName());
>        fd.setFontFamily(metric.getFamilyName());
>        fd.setNonSymbolic(true);
>        fd.setFontBoundingBox(new PDRectangle(metric.getFontBBox()));
>        fd.setItalicAngle(metric.getItalicAngle());
>        fd.setAscent(metric.getAscender());
>        fd.setDescent(metric.getDescender());
>        fd.setCapHeight(metric.getCapHeight());
>        fd.setXHeight(metric.getXHeight());
>        fd.setAverageWidth(metric.getAverageCharacterWidth());
>        fd.setCharacterSet(metric.getCharacterSet());
> 
>        // get firstchar, lastchar
>        int firstchar = 255;
>        int lastchar = 0;
> 
>        // widths
>        List<CharMetric> listmetric = metric.getCharMetrics();
>        Encoding encoding = getFontEncoding();
>        int maxWidths = 256;
>        List<Float> widths = new ArrayList<Float>(maxWidths);
>        int zero = 250;
>        Iterator<CharMetric> iter = listmetric.iterator();
>        for (int i = 0; i < maxWidths; i++)
>        {
>            widths.add((float)zero);
>        }
>        while (iter.hasNext())
>        {
>            CharMetric m = iter.next();
>            int n = m.getCharacterCode();
>            if (n > 0)
>            {
>                firstchar = Math.min(firstchar, n);
>                lastchar = Math.max(lastchar, n);
>                if (m.getWx() > 0)
>                {
>                    int width = Math.round(m.getWx());
>                    widths.set(n, (float)width);
>                    // germandbls has 2 character codes !! Don't ask me why .....
>                    // StandardEncoding = 0373 = 251
>                    // WinANSIEncoding = 0337 = 223
>                    if (m.getName().equals("germandbls") && n != 223)
>                    {
>                        widths.set(0337, (float)width);
>                    }
>                }
>            }
>        }
>        setFirstChar(0);
>        setLastChar(255);
>        setWidths(widths);
>    }
> 
>    /*
>     * This will generate a Encoding from the AFM-Encoding, because the AFM-Enconding isn't exported to the pdf and
>     * consequently the StandardEncoding is used so that any special character is missing I've copied the code from the
>     * pdfbox-forum posted by V0JT4 and made some additions concerning german umlauts see also
>     * https://sourceforge.net/forum/message.php?msg_id=4705274
>     */
>    private DictionaryEncoding afmToDictionary(AFMEncoding encoding) throws java.io.IOException
>    {
>        COSArray array = new COSArray();
>        COSArray differEncoding = new COSArray();
>        array.add(COSInteger.ZERO);
>        for (int i = 0; i < 256; i++)
>        {
>                String  name = encoding.getName(i);
>                COSName cosName = COSName.getPDFName(name);
>                if ( name!=null )
>                {
>                  COSNumber cosDifferCode = COSNumber.get(new Integer(i).toString());
>                  COSName cosDifferName = COSName.getPDFName(name);
>                  differEncoding.add( cosDifferCode );
>                  differEncoding.add( cosDifferName );
>                }
>            array.add(cosName);
>        }
> 
>        COSDictionary dictionary = new COSDictionary();
>        dictionary.setItem(COSName.NAME, COSName.ENCODING);
>        dictionary.setItem(COSName.DIFFERENCES, array);
>        dictionary.setItem(COSName.BASE_ENCODING, COSName.STANDARD_ENCODING);
> 
>        if ( differEncoding.size()> 0 )
>        {
>                        fontEncodingDic = new COSDictionary();
>                        fontEncodingDic.setItem(COSName.TYPE, COSName.ENCODING);
>                        fontEncodingDic.setItem(COSName.DIFFERENCES, differEncoding);
> 
>                        COSDictionary afont = (COSDictionary) getCOSObject();
>                        afont.setItem(COSName.ENCODING, fontEncodingDic);
>        }
> 
>        return new DictionaryEncoding(dictionary);
>    }
>    /**
>     * The encoding dirctionary
>     *
>     * @return The dictionary of encoding
>     *
>     */
>    public COSDictionary getFontEncodingDic()
>    {
>        return fontEncodingDic;
>    }
>    @Override
>    public void clear()
>    {
>        super.clear();
>        metric = null;
>    }
> }
> 
> 
> 
> Best regard
> Klaus Graaf
> 
> Lufthansa Systems
> Dr. Klaus Graaf
> Schützenwall 1
> D-22844 Norderstedt
> 
> Büro:       +49-40-5070-6849
> Fax:        +49-40-5070-7880
> Handy:    +49-151-58920261
> Internet:  http://www.lhsystems.com<http://www.lhsystems.com/>
> Email:     mailto:Klaus.Graaf@lhsystems.com
> 
> 
>  
> Sitz der Gesellschaft / Corporate Headquarters: Lufthansa Systems AS GmbH, Norderstedt, Registereintragung / Registration: Amtsgericht Norderstedt 3688NO
> Geschaeftsfuehrung / Management Board: Bernd Appel
> 
>