You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "John Hewson (JIRA)" <ji...@apache.org> on 2015/11/25 17:54:11 UTC
[jira] [Closed] (PDFBOX-3132) Cannot extract text which font is
Type0 with predefined CJK CMap
[ https://issues.apache.org/jira/browse/PDFBOX-3132?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
John Hewson closed PDFBOX-3132.
-------------------------------
Resolution: Won't Fix
We don't support this in 1.8. Use 2.0 instead.
> Cannot extract text which font is Type0 with predefined CJK CMap
> ----------------------------------------------------------------
>
> Key: PDFBOX-3132
> URL: https://issues.apache.org/jira/browse/PDFBOX-3132
> Project: PDFBox
> Issue Type: Improvement
> Components: PDModel
> Affects Versions: 1.8.9
> Reporter: Raymond Wu
> Attachments: pdf_font-zhcn.pdf
>
>
> {noformat}
> 20 0 obj
> <<
> /Type /Font
> /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> /Subtype /Type0
> /Encoding /UniGB-UTF16-H
> /DescendantFonts [42 0 R]
> >>
> endobj
> {noformat}
> If the Type0 font is defined as above, org.apache.pdfbox.pdmodel.font.PDType0Font cannot produce the CJK string.
> PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
> so Chinese, Japanese, and Korean text that uses such a font cannot be extracted.
> I tried modifying the PDType0Font source as follows, and it works.
> {code:title=PDType0Font.java|borderStyle=solid}
> @Override
> public String encode(byte[] c, int offset, int length) throws IOException
> {
> String retval = null;
> if (hasToUnicode())
> {
> retval = super.encode(c, offset, length);
> }
>
> if (retval == null)
> {
> int result = cmap.lookupCID(c, offset, length);
> if (result != -1)
> {
> retval = descendantFont.cmapEncoding(result, 2, true, null);
> } else {
> // Predefined CJK CMap
> //
> // PDF Source:
> // 20 0 obj
> // <<
> // /Type /Font
> // /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> // /Subtype /Type0
> // /Encoding /UniGB-UTF16-H
> // /DescendantFonts [42 0 R]
> // >>
> // endobj
> //
>
> COSBase encoding = getEncoding();
> if (length == 2 && encoding instanceof COSName)
> {
> String encname = ((COSName)encoding).getName();
> String charset = charsetOfPredefinedCJKCMap(encname);
> if (charset!=null) {
> retval = new String(c, offset, length, charset);
> }
> }
> }
> }
>
> return retval;
> }
> /**
> * Predefined CJK CMap name to Java charset name
> *
> * @author Raymond Wu <ra...@softnext.com.tw>
> * @param encname Predefined CJK CMap name
> * @return Java charset name
> */
> public String charsetOfPredefinedCJKCMap(String encname) {
> // PDF 32000-1:2008 Page 274
> // Table 118 – Predefined CJK CMap names
> //
> // @See http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
> // @See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
> // Unicode
> if (encname.contains("UTF16")) return "UTF-16BE";
> if (encname.contains("UCS2")) return "UTF-16BE";
>
> // Chinese (Traditional)
> // @See https://zh.wikipedia.org/wiki/巴別塔
> if (encname.startsWith("B5pc-")) return "BIG5";
> if (encname.startsWith("HKscs-")) return "MS950_HKSCS";
> if (encname.startsWith("ETen-")) return "MS950";
> if (encname.startsWith("ETenms-")) return "MS950";
> if (encname.startsWith("CNS-")) return "EUC-TW";
> // Chinese (Simplified)
> if (encname.startsWith("GB-")) return "MS936";
> if (encname.startsWith("GBpc-")) return "GB2312";
> if (encname.startsWith("GBK-")) return "MS936";
> if (encname.startsWith("GBKp-")) return "MS936";
> if (encname.startsWith("GBK2K-")) return "GB18030";
> // Japanese
> if (encname.startsWith("83pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk6 (漢字6)
> if (encname.startsWith("90ms-")) return "JISAutoDetect"; // MS932
> if (encname.startsWith("90msp-")) return "JISAutoDetect"; // MS932
> if (encname.startsWith("90pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk7 (漢字7)
> if (encname.startsWith("Add-")) return "JISAutoDetect"; // JIS X 0208 + Fujitsu FMR
> if (encname.startsWith("EUC-")) return "JISAutoDetect"; // JIS X 0208
> if (encname.startsWith("Ext-")) return "JISAutoDetect"; // JIS C 6226 + NEC
> if (encname.equals("H")) return "JISAutoDetect"; // ISO-2022-JP
> if (encname.equals("V")) return "JISAutoDetect"; // ISO-2022-JP
> // Korean
> if (encname.startsWith("KSC-")) return "EUC_KR";
> if (encname.startsWith("KSCms-")) return "MS949";
> if (encname.startsWith("KSCpc-")) return "EUC_KR";
> return null;
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org