You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2021/02/17 16:35:41 UTC

svn commit: r1886626 - in /pdfbox/trunk: fontbox/src/main/java/org/apache/fontbox/cmap/ pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/ pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/

Author: tilman
Date: Wed Feb 17 16:35:40 2021
New Revision: 1886626

URL: http://svn.apache.org/viewvc?rev=1886626&view=rev
Log:
PDFBOX-5103: allow reuse of subsetted fonts by inverting the ToUnicode CMap

Modified:
    pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java

Modified: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java Wed Feb 17 16:35:40 2021
@@ -64,6 +64,9 @@ public class CMap
     private final Map<Integer, Map<Integer, Integer>> codeToCid = new HashMap<>();
     private final List<CIDRange> codeToCidRanges = new ArrayList<>();
 
+    // inverted ToUnicode map: Unicode string -> code bytes (filled alongside the char mappings)
+    private final Map<String, byte[]> unicodeToByteCodes = new HashMap<>();
+
     private static final String SPACE = " ";
     private int spaceMapping = -1;
 
@@ -120,6 +123,7 @@ public class CMap
      * Returns the sequence of Unicode characters for the given character code.
      *
      * @param code character code
+     * @param length code length
      * @return Unicode characters (may be more than one, e.g "fi" ligature)
      */
     public String toUnicode(int code, int length)
@@ -356,6 +360,7 @@ public class CMap
      */
     void addCharMapping(byte[] codes, String unicode)
     {
+        unicodeToByteCodes.put(unicode, codes.clone()); // clone needed, bytes is modified later
         int code = getCodeFromArray(codes, 0, codes.length);
         if (codes.length == 1)
         {
@@ -377,6 +382,17 @@ public class CMap
     }
 
     /**
+     * Get the code bytes for a Unicode string (lookup in the inverted map).
+     *
+     * @param unicode the Unicode string to look up
+     * @return the stored code bytes (the map's own array, not a copy) or null if there is none.
+     */
+    public byte[] getCodesFromUnicode(String unicode)
+    {
+        return unicodeToByteCodes.get(unicode);
+    }
+
+    /**
      * This will add a CID mapping.
      *
      * @param code character code
@@ -446,6 +462,16 @@ public class CMap
         cmap.codespaceRanges.forEach(this::addCodespaceRange);
         charToUnicodeOneByte.putAll(cmap.charToUnicodeOneByte);
         charToUnicodeTwoBytes.putAll(cmap.charToUnicodeTwoBytes);
+        // mask, not modulo: "% 0xFF" would turn code 0xFF into 0x00 and collide with code 0
+        cmap.charToUnicodeOneByte.forEach((code, unicode) ->
+            unicodeToByteCodes.put(unicode, new byte[] {(byte) (code & 0xFF)})
+        );
+        cmap.charToUnicodeTwoBytes.entrySet().forEach(entry ->
+        {
+            Integer key = entry.getKey();
+            unicodeToByteCodes.put(entry.getValue(), 
+                    new byte[] {(byte) ((key >>> 8) & 0xFF), (byte) (key & 0xFF)});
+        });
         cmap.codeToCid.forEach((key, value) ->
         {
             if (codeToCid.containsKey(key))

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java Wed Feb 17 16:35:40 2021
@@ -357,8 +357,19 @@ public class PDCI
             // otherwise we require an explicit ToUnicode CMap
             if (cid == -1)
             {
-                //TODO: invert the ToUnicode CMap?
-                // see also PDFBOX-4233
+                // getToUnicodeCMap() returns null when there is no /ToUnicode CMap, so guard it
+                CMap toUnicodeCMap = parent.getToUnicodeCMap();
+                if (toUnicodeCMap != null)
+                {
+                    // NOTE(review): if unicode is an int code point, the (char) cast drops
+                    // everything above U+FFFF - confirm whether that matters here
+                    String text = Character.toString((char) unicode);
+                    byte[] codes = toUnicodeCMap.getCodesFromUnicode(text);
+                    if (codes != null)
+                    {
+                        return codes;
+                    }
+                }
                 cid = 0;
             }
         }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Wed Feb 17 16:35:40 2021
@@ -109,6 +109,11 @@ public abstract class PDFont implements
         // standard 14 fonts use an AFM
         afmStandard14 = Standard14Fonts.getAFM(getName()); // may be null (it usually is)
         fontDescriptor = loadFontDescriptor();
+//        System.out.println(getName() + " " + isStandard14());
+//        if (isStandard14())
+//        toUnicodeCMap = loadUnicodeCmap();
+//        else
+//            toUnicodeCMap = null;
         toUnicodeCMap = loadUnicodeCmap();
     }
 
@@ -637,4 +642,14 @@ public abstract class PDFont implements
     {
         return getClass().getSimpleName() + " " + getName();
     }
+
+    /**
+     * Get the /ToUnicode CMap held in the toUnicodeCMap field of this font.
+     *
+     * @return The /ToUnicode CMap or null if there is none; callers must check for null.
+     */
+    protected CMap getToUnicodeCMap()
+    {
+        return toUnicodeCMap;
+    }
 }

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java Wed Feb 17 16:35:40 2021
@@ -20,6 +20,7 @@ package org.apache.pdfbox.pdmodel.font;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNull;
 
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -34,6 +35,7 @@ import org.apache.pdfbox.pdmodel.PDDocum
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
 import org.apache.pdfbox.rendering.TestPDFToImage;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.junit.jupiter.api.BeforeEach;
@@ -314,4 +316,57 @@ class TestFontEmbedding
         PDFTextStripper stripper = new PDFTextStripper();
         return stripper.getText(document);
     }
+
+    /**
+     * Test that an embedded and subsetted font can be reused after saving and reloading.
+     *
+     * @throws IOException if loading the font or saving/loading a document fails
+     */
+    @Test
+    public void testReuseEmbeddedSubsettedFont() throws IOException
+    {
+        String text1 = "The quick brown fox";
+        String text2 = "xof nworb kciuq ehT";
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        // the font stream is a resource of this try too, so it is closed once the save is done
+        try (PDDocument document = new PDDocument(); InputStream input = PDFont.class
+                .getResourceAsStream("/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf"))
+        {
+            PDPage page = new PDPage();
+            document.addPage(page);
+            PDType0Font font = PDType0Font.load(document, input);
+            try (PDPageContentStream stream = new PDPageContentStream(document, page))
+            {
+                stream.beginText();
+                stream.setFont(font, 20);
+                stream.newLineAtOffset(50, 600);
+                stream.showText(text1);
+                stream.endText();
+            }
+            document.save(baos);
+        }
+        // Append, while reusing the font subset
+        try (PDDocument document = Loader.loadPDF(baos.toByteArray()))
+        {
+            PDPage page = document.getPage(0);
+            PDFont font = page.getResources().getFont(COSName.getPDFName("F1"));
+            try (PDPageContentStream stream = new PDPageContentStream(document, page, AppendMode.APPEND, true))
+            {
+                stream.beginText();
+                stream.setFont(font, 20);
+                stream.newLineAtOffset(250, 600);
+                stream.showText(text2);
+                stream.endText();
+            }
+            baos.reset();
+            document.save(baos);
+        }
+        // Test that both texts are there
+        try (PDDocument document = Loader.loadPDF(baos.toByteArray()))
+        {
+            PDFTextStripper stripper = new PDFTextStripper();
+            String extractedText = stripper.getText(document);
+            assertEquals(text1 + " " + text2, extractedText.trim());
+        }
+    }
 }