You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/12/12 22:50:41 UTC

svn commit: r1645083 - in /pdfbox/trunk: fontbox/src/main/java/org/apache/fontbox/ttf/ pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/

Author: jahewson
Date: Fri Dec 12 21:50:41 2014
New Revision: 1645083

URL: http://svn.apache.org/r1645083
Log:
PDFBOX-2524: Generate ToUnicode CMap for embedded CIDFontType2

Added:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ToUnicodeWriter.java   (with props)
Modified:
    pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/CmapSubtable.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2Embedder.java

Modified: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/CmapSubtable.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/CmapSubtable.java?rev=1645083&r1=1645082&r2=1645083&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/CmapSubtable.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/CmapSubtable.java Fri Dec 12 21:50:41 2014
@@ -541,16 +541,16 @@ public class CmapSubtable
     }
 
     /**
-     * Returns the character code for the given GID.
+     * Returns the character code for the given GID, or null if the GID is out of range.
      *
      * @param gid glyph id
      * @return character code
      */
-    public int getCharacterCode(int gid)
+    public Integer getCharacterCode(int gid)
     {
         if (gid < 0 || gid >= glyphIdToCharacterCode.length)
         {
-            return 0;
+            return null;
         }
         return glyphIdToCharacterCode[gid];
     }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2Embedder.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2Embedder.java?rev=1645083&r1=1645082&r2=1645083&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2Embedder.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2Embedder.java Fri Dec 12 21:50:41 2014
@@ -17,6 +17,8 @@
 
 package org.apache.pdfbox.pdmodel.font;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.IOException;
 
@@ -25,6 +27,7 @@ import org.apache.pdfbox.cos.COSDictiona
 import org.apache.pdfbox.cos.COSInteger;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.common.PDStream;
 
 /**
  * Embedded PDCIDFontType2 builder. Helper class to populate a PDCIDFontType2 and its parent
@@ -63,6 +66,51 @@ final class PDCIDFontType2Embedder exten
         COSArray descendantFonts = new COSArray();
         descendantFonts.add(cidFont);
         dict.setItem(COSName.DESCENDANT_FONTS, descendantFonts);
+
+        // ToUnicode CMap
+        dict.setItem(COSName.TO_UNICODE, createToUnicodeCMap(document));
+    }
+
+    /**
+     * Builds the ToUnicode CMap stream, mapping every GID that has a code point to its text.
+     *
+     * @param document document passed to the new stream; raised to PDF 1.5 if needed
+     * @return the ToUnicode CMap stream, with compression added
+     * @throws IOException if the CMap could not be written
+     */
+    private PDStream createToUnicodeCMap(PDDocument document) throws IOException
+    {
+        ToUnicodeWriter toUniWriter = new ToUnicodeWriter("Adobe", "Identity", 0);
+        boolean hasSurrogates = false;
+        // GID 0 is skipped; the last valid GID is numGlyphs - 1, so the bound is exclusive
+        for (int gid = 1, count = ttf.getMaximumProfile().getNumGlyphs(); gid < count; gid++)
+        {
+            Integer codePoint = cmap.getCharacterCode(gid);
+            // skip composite glyph components that have no code point
+            if (codePoint != null)
+            {
+                hasSurrogates |= codePoint > 0xFFFF;
+                toUniWriter.add(gid, new String(new int[]{ codePoint }, 0, 1));
+            }
+        }
+
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        toUniWriter.writeTo(out);
+        InputStream cMapStream = new ByteArrayInputStream(out.toByteArray());
+
+        // ToUnicode stream dictionary
+        PDStream stream = new PDStream(document, cMapStream, false);
+        stream.getStream().setItem(COSName.TYPE, COSName.CMAP);
+        stream.getStream().setName(COSName.CMAPNAME, toUniWriter.getName());
+        stream.getStream().setItem(COSName.CIDSYSTEMINFO, toCIDSystemInfo("Adobe", "Identity", 0));
+        stream.addCompression();
+
+        // surrogate code points, requires PDF 1.5
+        if (hasSurrogates && document.getDocument().getVersion() < 1.5f)
+        {
+            document.getDocument().setVersion(1.5f);
+        }
+        return stream;
+    }
 
     private COSDictionary toCIDSystemInfo(String registry, String ordering, int supplement)

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ToUnicodeWriter.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ToUnicodeWriter.java?rev=1645083&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ToUnicodeWriter.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ToUnicodeWriter.java Fri Dec 12 21:50:41 2014
@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.pdmodel.font;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import org.apache.pdfbox.util.Charsets;
+
+/**
+ * Writes ToUnicode mapping files (CMaps), which map the CIDs of a font to Unicode text.
+ *
+ * @author John Hewson
+ */
+final class ToUnicodeWriter
+{
+    private static final int MAX_ENTRIES_PER_OPERATOR = 100;
+
+    private final String registry, ordering;
+    private final int supplement;
+    private final Map<Integer, String> cidToUnicode = new TreeMap<Integer, String>();
+    private int wMode;
+
+    /**
+     * Creates a new ToUnicode CMap writer.
+     *
+     * @param registry character collection registry
+     * @param ordering character ordering
+     * @param supplement character supplement
+     */
+    public ToUnicodeWriter(String registry, String ordering, int supplement)
+    {
+        this.registry = registry;
+        this.ordering = ordering;
+        this.supplement = supplement;
+        this.wMode = 0;
+    }
+
+    /**
+     * Sets the WMode (writing mode) of this CMap.
+     *
+     * @param wMode 1 for vertical, 0 for horizontal (default)
+     */
+    public void setWMode(int wMode)
+    {
+        this.wMode = wMode;
+    }
+
+    /**
+     * Adds the given CID to Unicode mapping, replacing any earlier mapping for the same CID.
+     *
+     * @param cid CID, from 0 to 0xFFFF inclusive
+     * @param text Unicode text, up to 512 bytes.
+     * @throws IllegalArgumentException if the CID is out of range, or the text is null or empty
+     */
+    public void add(int cid, String text)
+    {
+        if (cid < 0 || cid > 0xFFFF)
+        {
+            throw new IllegalArgumentException("CID is not valid");
+        }
+
+        if (text == null || text.isEmpty())
+        {
+            throw new IllegalArgumentException("Text is null or empty");
+        }
+
+        cidToUnicode.put(cid, text);
+    }
+
+    /**
+     * Returns the name of the CMap.
+     */
+    public String getName()
+    {
+        return registry + "-" + ordering + "-UCS";
+    }
+
+    /**
+     * Writes the CMap as ASCII to the given output stream.
+     *
+     * @param out ASCII output stream
+     * @throws IOException if the stream could not be written
+     */
+    public void writeTo(OutputStream out) throws IOException
+    {
+        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, Charsets.US_ASCII));
+
+        writeLine(writer, "/CIDInit /ProcSet findresource begin");
+        writeLine(writer, "12 dict begin\n");
+
+        writeLine(writer, "begincmap");
+        writeLine(writer, "/CIDSystemInfo");
+        writeLine(writer, "<< /Registry (" + registry + ")");
+        writeLine(writer, "   /Ordering (" + ordering + ")");
+        writeLine(writer, "   /Supplement " + supplement);
+        writeLine(writer, ">> def\n");
+
+        writeLine(writer, "/CMapName /" + getName() + " def");
+        writeLine(writer, "/CMapType 2 def\n"); // 2 = ToUnicode
+
+        if (wMode != 0)
+        {
+            // WMode takes an integer operand, not a name
+            writeLine(writer, "/WMode " + wMode + " def");
+        }
+
+        // ToUnicode always uses 16-bit CIDs
+        writeLine(writer, "1 begincodespacerange");
+        writeLine(writer, "<0000> <FFFF>");
+        writeLine(writer, "endcodespacerange\n");
+
+        // CID -> Unicode mappings, we use ranges to generate a smaller CMap
+        List<Integer> srcFrom = new ArrayList<Integer>();
+        List<Integer> srcTo = new ArrayList<Integer>();
+        List<String> dstString = new ArrayList<String>();
+
+        int srcPrev = -1;
+        String dstPrev = null;
+
+        for (Map.Entry<Integer, String> entry : cidToUnicode.entrySet())
+        {
+            int cid = entry.getKey();
+            String text = entry.getValue();
+
+            if (canExtendRange(srcPrev, dstPrev, cid, text))
+            {
+                // extend range
+                srcTo.set(srcTo.size() - 1, cid);
+            }
+            else
+            {
+                // begin range
+                srcFrom.add(cid);
+                srcTo.add(cid);
+                dstString.add(text);
+            }
+            srcPrev = cid;
+            dstPrev = text;
+        }
+
+        // limit of 100 entries per operator; the final batch holds whatever remains
+        for (int start = 0; start < srcFrom.size(); start += MAX_ENTRIES_PER_OPERATOR)
+        {
+            int end = Math.min(start + MAX_ENTRIES_PER_OPERATOR, srcFrom.size());
+            writer.write((end - start) + " beginbfrange\n");
+            for (int index = start; index < end; index++)
+            {
+                writer.write(String.format("<%04X> <%04X> <%s>\n", srcFrom.get(index),
+                        srcTo.get(index), stringToHex(dstString.get(index))));
+            }
+            writeLine(writer, "endbfrange\n");
+        }
+
+        // footer
+        writeLine(writer, "endcmap");
+        writeLine(writer, "CMapName currentdict /CMap defineresource pop");
+        writeLine(writer, "end");
+        writeLine(writer, "end");
+
+        writer.flush();
+    }
+
+    /**
+     * Returns true if cid -> text may join the bfrange ending at srcPrev -> dstPrev (null
+     * before the first mapping). A bfrange increments only the last byte of the source code
+     * and of the destination, so neither step may carry into a higher byte.
+     */
+    private boolean canExtendRange(int srcPrev, String dstPrev, int cid, String text)
+    {
+        return dstPrev != null && cid == srcPrev + 1 && (cid & 0xFF) != 0 &&
+               dstPrev.codePointCount(0, dstPrev.length()) == 1 &&
+               text.codePointCount(0, text.length()) == 1 &&
+               text.codePointAt(0) == dstPrev.codePointAt(0) + 1 &&
+               (dstPrev.charAt(dstPrev.length() - 1) & 0xFF) != 0xFF;
+    }
+
+    private void writeLine(BufferedWriter writer, String text) throws IOException
+    {
+        writer.write(text);
+        writer.write('\n');
+    }
+
+    private String stringToHex(String text)
+    {
+        // use of non-BMP code points requires PDF 1.5 or later, otherwise we're limited to UCS-2
+        StringBuilder sb = new StringBuilder();
+        for (byte b : text.getBytes(Charsets.UTF_16BE))
+        {
+            sb.append(String.format("%02X", b));
+        }
+        return sb.toString();
+    }
+}

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ToUnicodeWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native