You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "John Hewson (JIRA)" <ji...@apache.org> on 2015/11/25 17:54:11 UTC

[jira] [Closed] (PDFBOX-3132) Cannot extract text which font is Type0 with predefined CJK CMap

     [ https://issues.apache.org/jira/browse/PDFBOX-3132?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

John Hewson closed PDFBOX-3132.
-------------------------------
    Resolution: Won't Fix

We don't support this in 1.8. Use 2.0 instead.

> Cannot extract text which font is Type0 with predefined CJK CMap
> ----------------------------------------------------------------
>
>                 Key: PDFBOX-3132
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-3132
>             Project: PDFBox
>          Issue Type: Improvement
>          Components: PDModel
>    Affects Versions: 1.8.9
>            Reporter: Raymond Wu
>         Attachments: pdf_font-zhcn.pdf
>
>
> {noformat}
> 20 0 obj
> <<
> /Type /Font
> /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> /Subtype /Type0
> /Encoding /UniGB-UTF16-H
> /DescendantFonts [42 0 R]
> >>
> endobj
> {noformat}
> If the Type0 font is defined as above, CJK strings cannot be produced by org.apache.pdfbox.pdmodel.font.PDType0Font.
> PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps.
> So Chinese, Japanese and Korean text that uses this font cannot be extracted.
> I modified the PDType0Font source as shown below, and it works.
> {code:title=PDType0Font.java|borderStyle=solid}
>     @Override
>     public String encode(byte[] c, int offset, int length) throws IOException
>     {
>         String retval = null;
>         if (hasToUnicode())
>         {
>             retval = super.encode(c, offset, length);
>         }
>         
>         if (retval == null)
>         {
>             int result = cmap.lookupCID(c, offset, length);
>             if (result != -1)
>             {
>                 retval = descendantFont.cmapEncoding(result, 2, true, null);
>             } else {
>             	// Predefined CJK CMap
>             	//
>             	// PDF Source:
>             	// 20 0 obj
>             	// <<
>             	// /Type /Font
>             	// /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
>             	// /Subtype /Type0
>             	// /Encoding /UniGB-UTF16-H
>             	// /DescendantFonts [42 0 R]
>             	// >>
>             	// endobj
>             	//
>             	
>             	COSBase encoding = getEncoding();
>             	if (length == 2 && encoding instanceof COSName)
>             	{
>         			String encname = ((COSName)encoding).getName();
>         			String charset = charsetOfPredefinedCJKCMap(encname);
>         			if (charset!=null) {
>         				retval = new String(c, offset, length, charset);
>         			}
>             	}
>             }
>         }
>         
>         return retval;
>     }
>     /**
>      * Predefined CJK CMap name to Java charset name
>      * 
>      * @author Raymond Wu <ra...@softnext.com.tw>
>      * @param  encname Predefined CJK CMap name
>      * @return Java charset name
>      */
>     public String charsetOfPredefinedCJKCMap(String encname) {
> 		// PDF 32000-1:2008 Page 274
> 		// Table 118 – Predefined CJK CMap names
> 		//
> 		// @See http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
> 		// @See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
> 		// Unicode
> 		if (encname.contains("UTF16")) return "UTF-16BE";
> 		if (encname.contains("UCS2"))  return "UTF-16BE";
> 		
> 		// Chinese (Traditional)
> 		// @See https://zh.wikipedia.org/wiki/巴別塔
> 		if (encname.startsWith("B5pc-"))   return "BIG5";
> 		if (encname.startsWith("HKscs-"))  return "MS950_HKSCS";
> 		if (encname.startsWith("ETen-"))   return "MS950";
> 		if (encname.startsWith("ETenms-")) return "MS950";
> 		if (encname.startsWith("CNS-"))    return "EUC-TW";
> 		// Chinese (Simplified)
> 		if (encname.startsWith("GB-"))    return "MS936";
> 		if (encname.startsWith("GBpc-"))  return "GB2312";
> 		if (encname.startsWith("GBK-"))   return "MS936";
> 		if (encname.startsWith("GBKp-"))  return "MS936";
> 		if (encname.startsWith("GBK2K-")) return "GB18030";
> 		// Japanese
> 		if (encname.startsWith("83pv-"))  return "JISAutoDetect"; // JIS X 0208 + KanjiTalk6 (漢字6)
> 		if (encname.startsWith("90ms-"))  return "JISAutoDetect"; // MS932
> 		if (encname.startsWith("90msp-")) return "JISAutoDetect"; // MS932
> 		if (encname.startsWith("90pv-"))  return "JISAutoDetect"; // JIS X 0208 + KanjiTalk7 (漢字7)
> 		if (encname.startsWith("Add-"))   return "JISAutoDetect"; // JIS X 0208 + Fujitsu FMR
> 		if (encname.startsWith("EUC-"))   return "JISAutoDetect"; // JIS X 0208
> 		if (encname.startsWith("Ext-"))   return "JISAutoDetect"; // JIS C 6226 + NEC
> 		if (encname.equals("H"))          return "JISAutoDetect"; // ISO-2022-JP
> 		if (encname.equals("V"))          return "JISAutoDetect"; // ISO-2022-JP
> 		// Korean
> 		if (encname.startsWith("KSC-"))   return "EUC_KR";
> 		if (encname.startsWith("KSCms-")) return "MS949";
> 		if (encname.startsWith("KSCpc-")) return "EUC_KR";
>     	return null;
>     }
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org