You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2021/02/17 16:35:41 UTC
svn commit: r1886626 - in /pdfbox/trunk:
fontbox/src/main/java/org/apache/fontbox/cmap/
pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/
pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/
Author: tilman
Date: Wed Feb 17 16:35:40 2021
New Revision: 1886626
URL: http://svn.apache.org/viewvc?rev=1886626&view=rev
Log:
PDFBOX-5103: allow reuse of subsetted fonts by inverting the ToUnicode CMap
Modified:
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
Modified: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java Wed Feb 17 16:35:40 2021
@@ -64,6 +64,9 @@ public class CMap
private final Map<Integer, Map<Integer, Integer>> codeToCid = new HashMap<>();
private final List<CIDRange> codeToCidRanges = new ArrayList<>();
+ // inverted map: Unicode string -> code bytes, filled whenever a char mapping is added or merged
+ private final Map<String, byte[]> unicodeToByteCodes = new HashMap<>();
+
private static final String SPACE = " ";
private int spaceMapping = -1;
@@ -120,6 +123,7 @@ public class CMap
* Returns the sequence of Unicode characters for the given character code.
*
* @param code character code
+ * @param length code length
* @return Unicode characters (may be more than one, e.g "fi" ligature)
*/
public String toUnicode(int code, int length)
@@ -356,6 +360,7 @@ public class CMap
*/
void addCharMapping(byte[] codes, String unicode)
{
+ unicodeToByteCodes.put(unicode, codes.clone()); // clone needed, bytes is modified later
int code = getCodeFromArray(codes, 0, codes.length);
if (codes.length == 1)
{
@@ -377,6 +382,17 @@ public class CMap
}
/**
+ * Get the code bytes for a Unicode string, using the inverted char mappings.
+ *
+ * @param unicode the Unicode string to look up
+ * @return a copy of the code bytes, or null if there is none.
+ */
+ public byte[] getCodesFromUnicode(String unicode)
+ {
+ byte[] codes = unicodeToByteCodes.get(unicode);
+ // hand out a copy so that callers cannot modify the array kept in the inverted map
+ return codes == null ? null : codes.clone();
+ }
+
+ /**
* This will add a CID mapping.
*
* @param code character code
@@ -446,6 +462,16 @@ public class CMap
cmap.codespaceRanges.forEach(this::addCodespaceRange);
charToUnicodeOneByte.putAll(cmap.charToUnicodeOneByte);
charToUnicodeTwoBytes.putAll(cmap.charToUnicodeTwoBytes);
+ // invert the merged mappings; mask with & 0xFF (not % 0xFF) so that code 0xFF is not turned into 0
+ cmap.charToUnicodeOneByte.forEach((key, value) ->
+ {
+ unicodeToByteCodes.put(value, new byte[] {(byte) (key & 0xFF)});
+ });
+ cmap.charToUnicodeTwoBytes.forEach((key, value) ->
+ {
+ unicodeToByteCodes.put(value,
+ new byte[] {(byte) ((key >>> 8) & 0xFF), (byte) (key & 0xFF)});
+ });
cmap.codeToCid.forEach((key, value) ->
{
if (codeToCid.containsKey(key))
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java Wed Feb 17 16:35:40 2021
@@ -357,8 +357,12 @@ public class PDCIDFontType2 extends PDCI
// otherwise we require an explicit ToUnicode CMap
if (cid == -1)
{
- //TODO: invert the ToUnicode CMap?
- // see also PDFBOX-4233
+ // invert the ToUnicode CMap; the parent font may not have one at all
+ CMap toUnicodeCMap = parent.getToUnicodeCMap();
+ if (toUnicodeCMap != null && Character.isValidCodePoint(unicode))
+ {
+ // build the key from the whole code point, a (char) cast would truncate
+ // supplementary characters
+ String text = new String(Character.toChars(unicode));
+ byte[] codes = toUnicodeCMap.getCodesFromUnicode(text);
+ if (codes != null)
+ {
+ return codes;
+ }
+ }
cid = 0;
}
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Wed Feb 17 16:35:40 2021
@@ -109,6 +109,11 @@ public abstract class PDFont implements
// standard 14 fonts use an AFM
afmStandard14 = Standard14Fonts.getAFM(getName()); // may be null (it usually is)
fontDescriptor = loadFontDescriptor();
toUnicodeCMap = loadUnicodeCmap();
}
@@ -637,4 +642,14 @@ public abstract class PDFont implements
{
return getClass().getSimpleName() + " " + getName();
}
+
+ /**
+ * Returns the /ToUnicode CMap held by this font.
+ *
+ * @return the /ToUnicode CMap, or null when the font does not have one.
+ */
+ protected CMap getToUnicodeCMap()
+ {
+ return this.toUnicodeCMap;
+ }
}
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java?rev=1886626&r1=1886625&r2=1886626&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java Wed Feb 17 16:35:40 2021
@@ -20,6 +20,7 @@ package org.apache.pdfbox.pdmodel.font;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -34,6 +35,7 @@ import org.apache.pdfbox.pdmodel.PDDocum
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.rendering.TestPDFToImage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.jupiter.api.BeforeEach;
@@ -314,4 +316,57 @@ class TestFontEmbedding
PDFTextStripper stripper = new PDFTextStripper();
return stripper.getText(document);
}
+
+ /**
+ * Checks that a font which was embedded and subsetted can be used again after the
+ * document has been saved and loaded back.
+ *
+ * @throws IOException if a document cannot be created, saved or loaded.
+ */
+ @Test
+ public void testReuseEmbeddedSubsettedFont() throws IOException
+ {
+ // the second text is the first one reversed, so both need the same characters
+ String firstText = "The quick brown fox";
+ String secondText = "xof nworb kciuq ehT";
+ ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
+ // First pass: create a document, show the first text with a freshly loaded font, save
+ try (PDDocument doc = new PDDocument())
+ {
+ PDPage firstPage = new PDPage();
+ doc.addPage(firstPage);
+ InputStream ttfStream = PDFont.class.getResourceAsStream(
+ "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
+ PDType0Font embeddedFont = PDType0Font.load(doc, ttfStream);
+ try (PDPageContentStream contents = new PDPageContentStream(doc, firstPage))
+ {
+ contents.beginText();
+ contents.setFont(embeddedFont, 20);
+ contents.newLineAtOffset(50, 600);
+ contents.showText(firstText);
+ contents.endText();
+ }
+ doc.save(pdfBytes);
+ }
+ // Second pass: load the saved bytes and append the second text with the font
+ // taken from the page resources instead of loading it again
+ try (PDDocument doc = Loader.loadPDF(pdfBytes.toByteArray()))
+ {
+ PDPage firstPage = doc.getPage(0);
+ COSName fontName = COSName.getPDFName("F1");
+ PDFont reusedFont = firstPage.getResources().getFont(fontName);
+ try (PDPageContentStream contents =
+ new PDPageContentStream(doc, firstPage, AppendMode.APPEND, true))
+ {
+ contents.beginText();
+ contents.setFont(reusedFont, 20);
+ contents.newLineAtOffset(250, 600);
+ contents.showText(secondText);
+ contents.endText();
+ }
+ // discard the bytes of the first pass before saving the appended document
+ pdfBytes.reset();
+ doc.save(pdfBytes);
+ }
+ // Third pass: extract the text and expect both strings, separated by a space
+ try (PDDocument doc = Loader.loadPDF(pdfBytes.toByteArray()))
+ {
+ String extracted = new PDFTextStripper().getText(doc);
+ assertEquals(firstText + " " + secondText, extracted.trim());
+ }
+ }
}