You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2009/03/09 13:46:41 UTC
svn commit: r751664 - in
/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox:
encoding/conversion/ pdmodel/font/
Author: lehmi
Date: Mon Mar 9 12:46:41 2009
New Revision: 751664
URL: http://svn.apache.org/viewvc?rev=751664&view=rev
Log:
PDFBOX-420: Adding CJK support, especially for text extraction. Thanks to Pin Xue http://www.pinxue.net
Added:
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java (with props)
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java (with props)
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (with props)
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java (with props)
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java (with props)
Modified:
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java Mon Mar 9 12:46:41 2009
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.encoding.conversion;
+
+import org.fontbox.cmap.CMap;
+import java.io.UnsupportedEncodingException;
+
+
+/**
+ * CJKConverter converts text for the PDF encodings listed in CJKEncodings.
+ * Each instance is bound to one PDF encoding name and to the Java charset
+ * name that CJKEncodings.getCharset returns for it.
+ *
+ * NOTE(review): CJKEncodings.getCharset is documented to return null for an
+ * unknown name; convertString would then throw a NullPointerException on
+ * the equalsIgnoreCase call. Confirm that only mapped names reach here.
+ *
+ * @author pinxue <http://www.pinxue.net>, Holly Lee <holly.lee (at) gmail.com>
+ */
+class CJKConverter implements EncodingConverter
+{
+ /** The PDF encoding name passed to the constructor (stored, not read in this class) */
+ private String _encoding = null;
+ /** The Java charset name looked up from the PDF encoding name */
+ private String _charset = null;
+
+
+ /**
+ * Constructs a CJKConverter from a PDF encoding name and resolves the
+ * matching Java charset name through CJKEncodings.getCharset.
+ *
+ * @param encoding PDF encoding name, e.g. GBK-EUC-H
+ */
+ public CJKConverter(String encoding)
+ {
+ _encoding = encoding;
+ _charset = CJKEncodings.getCharset(encoding);
+ }
+
+ /**
+ * Convert a string. This is used when a cmap lookup returned
+ * converted bytes successfully, but the result still needs to be
+ * re-decoded with the charset of this converter. The parameter s is
+ * constructed from one byte or from a UTF-16BE encoded string.
+ *
+ * Note: pdfbox sets the string to the UTF-16BE charset before calling
+ * into this method.
+ *
+ * The string is returned unchanged when it has length 1, when the
+ * charset of this converter is UTF-16BE (compared ignoring case), or
+ * when the Java runtime does not support the charset. Otherwise the
+ * string is encoded to UTF-16BE bytes and those bytes are decoded with
+ * the charset of this converter.
+ *
+ * @param s the string produced by the cmap lookup
+ * @return the re-decoded string, or s itself in the cases listed above
+ */
+ public String convertString(String s)
+ {
+ if ( s.length() == 1 )
+ return s;
+
+ if ( _charset.equalsIgnoreCase("UTF-16BE") )
+ return s;
+
+ try {
+ return new String(s.getBytes("UTF-16BE"), _charset);
+ }
+ catch ( UnsupportedEncodingException uee ) {
+ return s;
+ }
+ }
+
+ /**
+ * Convert bytes to a string. Only bytes that lie within a
+ * code space range defined in the CMap are converted.
+ *
+ * If the charset of this converter is not supported by the Java
+ * runtime, the bytes are decoded with the platform default charset
+ * instead (String constructor without a charset argument).
+ *
+ * @param c the byte array holding the character code
+ * @param offset index of the first byte to convert
+ * @param length number of bytes to convert
+ * @param cmap the CMap whose code space ranges are checked, may be null
+ * @return Converted string, or null if cmap is null or the bytes are
+ * not within the code space ranges of the cmap.
+ */
+ public String convertBytes(byte [] c, int offset, int length, CMap cmap)
+ {
+ if ( cmap != null ) {
+
+ try {
+ if ( cmap.isInCodeSpaceRanges(c, offset, length) )
+ return new String(c, offset, length, _charset);
+ else
+ return null;
+
+ }
+ catch ( UnsupportedEncodingException uee ) {
+ return new String(c, offset, length);
+ }
+
+ }
+
+ // No cmap: nothing to check the bytes against, so no conversion is done
+ return null;
+ }
+
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java Mon Mar 9 12:46:41 2009
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * This class represents the mapping from PDF encoding names to Java charset names.
+ * The table is filled once in the static initializer below.
+ *
+ * NOTE(review): the class is named CJKEncodings while the file added by this
+ * commit is CJKEncoding.java; confirm that this is intended.
+ *
+ * NOTE(review): several target names (JIS, KSC, HZ, ISO-10646-UCS-2) should be
+ * checked against the charsets the Java runtime actually supports. The inline
+ * comments describe Shift-JIS, EUC-JP, ISO-2022-JP, EUC-KR, UHC and EUC-TW
+ * encodings, which do not obviously correspond to those names. CJKConverter
+ * catches UnsupportedEncodingException, so an unsupported name would not
+ * crash but would skip the conversion.
+ *
+ * @author Pin Xue (http://www.pinxue.net), Holly Lee (holly.lee (at) gmail.com)
+ * @version $Revision: 1.0 $
+ */
+class CJKEncodings
+{
+ // Mapping: PDF encoding name -> Java (IANA) charset name
+ private static HashMap _mapping = new HashMap();
+
+ static
+ {
+ // Chinese (Simplified)
+ _mapping.put("GB-EUC-H", "GB2312"); // Microsoft Code Page 936 (lfCharSet 0x86), GB 2312-80 character set, EUC-CN encoding
+ _mapping.put("GB-EUC-V", "GB2312"); // Vertical version of GB-EUC-H
+ _mapping.put("GBpc-EUC-H", "GB2312"); // Mac OS, GB 2312-80 character set, EUC-CN encoding, Script Manager code 19
+ _mapping.put("GBpc-EUC-V", "GB2312"); // Vertical version of GBpc-EUC-H
+ _mapping.put("GBK-EUC-H", "GBK"); // Microsoft Code Page 936 (lfCharSet 0x86), GBK character set, GBK encoding
+ _mapping.put("GBK-EUC-V", "GBK"); // Vertical version of GBK-EUC-H
+ _mapping.put("GBKp-EUC-H", "GBK"); // Same as GBK-EUC-H but replaces half-width Latin characters with proportional forms and maps character code 0x24 to a dollar sign ($) instead of a yuan symbol (¥)
+ _mapping.put("GBKp-EUC-V", "GBK"); // Vertical version of GBKp-EUC-H
+ _mapping.put("GBK2K-H", "GB18030"); // GB 18030-2000 character set, mixed 1-, 2-, and 4-byte encoding
+ _mapping.put("GBK2K-V", "GB18030"); // Vertical version of GBK2K-H
+ _mapping.put("UniGB-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-GB1 character collection
+ _mapping.put("UniGB-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniGB-UCS2-H
+ _mapping.put("UniGB-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-GB1 character collection; contains mappings for all characters in the GB18030-2000 character set
+ _mapping.put("UniGB-UTF16-V", "UTF-16BE"); // Vertical version of UniGB-UTF16-H
+
+ // Chinese (Traditional)
+ _mapping.put("B5pc-H", "BIG5"); // Mac OS, Big Five character set, Big Five encoding, Script Manager code 2
+ _mapping.put("B5pc-V", "BIG5"); // Vertical version of B5pc-H
+ _mapping.put("HKscs-B5-H", "Big5-HKSCS"); // Hong Kong SCS, an extension to the Big Five character set and encoding
+ _mapping.put("HKscs-B5-V", "Big5-HKSCS"); // Vertical version of HKscs-B5-H
+ _mapping.put("ETen-B5-H", "BIG5"); // Microsoft Code Page 950 (lfCharSet 0x88), Big Five character set with ETen extensions
+ _mapping.put("ETen-B5-V", "BIG5"); // Vertical version of ETen-B5-H
+ _mapping.put("ETenms-B5-H", "BIG5"); // Same as ETen-B5-H but replaces half-width Latin characters with proportional forms
+ _mapping.put("ETenms-B5-V", "BIG5"); // Vertical version of ETenms-B5-H
+ _mapping.put("CNS-EUC-H", "HZ"); // CNS 11643-1992 character set, EUC-TW encoding
+ _mapping.put("CNS-EUC-V", "HZ"); // Vertical version of CNS-EUC-H
+ _mapping.put("UniCNS-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-CNS1 character collection
+ _mapping.put("UniCNS-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniCNS-UCS2-H
+ _mapping.put("UniCNS-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-CNS1 character collection; contains mappings for all the characters in the HKSCS-2001 character set and contains both 2- and 4- byte character codes
+ _mapping.put("UniCNS-UTF16-V", "UTF-16BE"); // Vertical version of UniCNS-UTF16-H
+
+ //Japanese
+ _mapping.put("83pv-RKSJ-H", "JIS"); // Mac OS, JIS X 0208 character set with KanjiTalk6 extensions, Shift-JIS encoding, Script Manager code 1
+ _mapping.put("90ms-RKSJ-H", "JIS"); // Microsoft Code Page 932 (lfCharSet 0x80), JIS X 0208 character set with NEC and IBM- extensions
+ _mapping.put("90ms-RKSJ-V", "JIS"); // Vertical version of 90ms-RKSJ-H
+ _mapping.put("90msp-RKSJ-H", "JIS"); // Same as 90ms-RKSJ-H but replaces half-width Latin characters with proportional forms
+ _mapping.put("90msp-RKSJ-V", "JIS"); // Vertical version of 90msp-RKSJ-H
+ _mapping.put("90pv-RKSJ-H", "JIS"); // Mac OS, JIS X 0208 character set with KanjiTalk7 extensions, Shift-JIS encoding, Script Manager code 1
+ _mapping.put("Add-RKSJ-H", "JIS"); // JIS X 0208 character set with Fujitsu FMR extensions, Shift-JIS encoding
+ _mapping.put("Add-RKSJ-V", "JIS"); // Vertical version of Add-RKSJ-H
+ _mapping.put("EUC-H", "JIS"); // JIS X 0208 character set, EUC-JP encoding
+ _mapping.put("EUC-V", "JIS"); // Vertical version of EUC-H
+ _mapping.put("Ext-RKSJ-H", "JIS"); // JIS C 6226 (JIS78) character set with NEC extensions, Shift-JIS encoding
+ _mapping.put("Ext-RKSJ-V", "JIS"); // Vertical version of Ext-RKSJ-H
+ _mapping.put("H", "JIS"); // JIS X 0208 character set, ISO-2022-JP encoding
+ _mapping.put("V", "JIS"); // Vertical version of H
+ _mapping.put("UniJIS-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-Japan1 character collection
+ _mapping.put("UniJIS-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniJIS-UCS2-H
+ _mapping.put("UniJIS-UCS2-HW-H", "ISO-10646-UCS-2"); // Same as UniJIS-UCS2-H but replaces proportional Latin characters with half-width forms
+ _mapping.put("UniJIS-UCS2-HW-V", "ISO-10646-UCS-2"); // Vertical version of UniJIS-UCS2-HW-H
+ _mapping.put("UniJIS-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-Japan1 character collection; contains mappings for all characters in the JIS X 0213:1000 character set
+ _mapping.put("UniJIS-UTF16-V", "UTF-16BE"); // Vertical version of UniJIS-UTF16-H
+ _mapping.put("Identity-H", "JIS"); // NOTE(review): Identity-H is presumably the identity CID mapping and not a Japanese encoding; it is treated like H here -- confirm
+ _mapping.put("Identity-V", "JIS"); // Vertical version of Identity-H
+
+ //Korean
+ _mapping.put("KSC-EUC-H", "KSC"); // KS X 1001:1992 character set, EUC-KR encoding
+ _mapping.put("KSC-EUC-V", "KSC"); // Vertical version of KSC-EUC-H
+ _mapping.put("KSCms-UHC-H", "KSC"); // Microsoft Code Page 949 (lfCharSet 0x81), KS X 1001:1992 character set plus 8822 additional hangul, Unified Hangul Code (UHC) encoding
+ _mapping.put("KSCms-UHC-V", "KSC"); // Vertical version of KSCms-UHC-H
+ _mapping.put("KSCms-UHC-HW-H", "KSC"); // Same as KSCms-UHC-H but replaces proportional Latin characters with half-width forms
+ _mapping.put("KSCms-UHC-HW-V", "KSC"); // Vertical version of KSCms-UHC-HW-H
+ _mapping.put("KSCpc-EUC-H", "KSC"); // Mac OS, KS X 1001:1992 character set with Mac OS KH extensions, Script Manager Code 3
+ _mapping.put("UniKS-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-Korea1 character collection
+ _mapping.put("UniKS-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniKS-UCS2-H
+ _mapping.put("UniKS-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-Korea1 character collection
+ _mapping.put("UniKS-UTF16-V", "UTF-16BE"); // Vertical version of UniKS-UTF16-H
+ }
+
+
+ /**
+ * Get respective Java charset name from given PDF encoding name.
+ *
+ * If the argument starts with the text COSName, its first 8 characters
+ * and its last character are removed before the lookup. This presumably
+ * unwraps the string form of a COSName object (a 7 letter prefix, one
+ * opening bracket and one closing bracket) -- TODO confirm against
+ * COSName.toString.
+ *
+ * @param encoding PDF encoding name, must not be null
+ * @return Java charset name, or null if not found
+ */
+ public static final String getCharset( String encoding )
+ {
+ if ( encoding.startsWith("COSName"))
+ encoding = encoding.substring(8, encoding.length()-1);
+
+ return (String)(_mapping.get(encoding));
+ }
+
+ /**
+ * Return an iterator to iterate through all PDF encoding names that
+ * have an entry in the mapping (the key set of the table).
+ *
+ * @return iterator over the PDF encoding names, each element is a String
+ */
+ public static final Iterator getEncodingIterator()
+ {
+ return _mapping.keySet().iterator();
+ }
+
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java Mon Mar 9 12:46:41 2009
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import java.util.HashMap;
+
+/**
+ * This class provides a mapping from the CMap name found in a font to the
+ * name of the char code to unicode mapping file used for CJK encodings.
+ *
+ * The only entry point is substituteCMap(String): it returns the substitute
+ * registered for the given CMap name, or the given name itself when no
+ * substitute is registered. In this commit PDFont applies it to the encoding
+ * name before building the resource name below Resources/cmap/.
+ *
+ * @author Andreas Lehmkühler
+ *
+ */
+
+public class CMapSubstitution {
+
+ private static HashMap cmapSubstitutions = new HashMap();
+
+ static {
+
+ // These mappings may be incomplete; further entries
+ // may still have to be added.
+
+ // chinese simplified
+ cmapSubstitutions.put( "Adobe-GB1-4", "Adobe-GB1-UCS2" );
+ cmapSubstitutions.put( "GBK-EUC-H", "GBK-EUC-UCS2" );
+ cmapSubstitutions.put( "GBK-EUC-V", "GBK-EUC-UCS2" );
+ cmapSubstitutions.put( "GBpc-EUC-H", "GBpc-EUC-UCS2C" );
+ cmapSubstitutions.put( "GBpc-EUC-V", "GBpc-EUC-UCS2C" );
+
+ // chinese traditional
+ cmapSubstitutions.put( "Adobe-CNS1-4", "Adobe-CNS1-UCS2" );
+ cmapSubstitutions.put( "B5pc-H", "B5pc-UCS2" );
+ cmapSubstitutions.put( "B5pc-V", "B5pc-UCS2" );
+ cmapSubstitutions.put( "ETen-B5-H", "ETen-B5-UCS2" );
+ cmapSubstitutions.put( "ETen-B5-V", "ETen-B5-UCS2" );
+ cmapSubstitutions.put( "ETenms-B5-H", "ETen-B5-UCS2" );
+ cmapSubstitutions.put( "ETenms-B5-V", "ETen-B5-UCS2" );
+
+ // japanese
+ cmapSubstitutions.put( "90ms-RKSJ-H", "90ms-RKSJ-UCS2" );
+ cmapSubstitutions.put( "90ms-RKSJ-V", "90ms-RKSJ-UCS2" );
+ cmapSubstitutions.put( "90msp-RKSJ-H", "90ms-RKSJ-UCS2" );
+ cmapSubstitutions.put( "90msp-RKSJ-V", "90ms-RKSJ-UCS2" );
+ cmapSubstitutions.put( "90pv-RKSJ-H", "90pv-RKSJ-UCS2");
+ cmapSubstitutions.put( "UniJIS-UCS2-HW-H", "UniJIS-UCS2-H" );
+ cmapSubstitutions.put( "Adobe-Japan1-4", "Adobe-Japan1-UCS2");
+ cmapSubstitutions.put( "Identity-H", "Adobe-Japan1-UCS2");
+
+ }
+
+ public static String substituteCMap(String cmapName) {
+ if (cmapSubstitutions.containsKey(cmapName))
+ return (String)cmapSubstitutions.get(cmapName);
+ return cmapName;
+ }
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java Mon Mar 9 12:46:41 2009
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import java.util.Iterator;
+import java.util.HashMap;
+
+/**
+ * EncodingConversionManager maintains the relationship between a PDF encoding
+ * name and the respective EncodingConverter instance. PDF encoding names like
+ * GBK-EUC-H have to be converted to a Java charset name before a
+ * Java string instance can be constructed from the encoded bytes.
+ */
+public class EncodingConversionManager
+{
+ /**
+ * Mapping from PDF encoding name (String) to EncodingConverter instance
+ */
+ private static HashMap _encodingMap = new HashMap();
+
+ /**
+ * Initialize the encodingMap before anything calls us: one CJKConverter
+ * is registered for every encoding name that CJKEncodings enumerates.
+ */
+ static {
+
+ // Add CJK encodings to map
+ Iterator it = CJKEncodings.getEncodingIterator();
+
+ while ( it.hasNext() ) {
+ String encodingName = (String)(it.next());
+ _encodingMap.put(encodingName, new CJKConverter(encodingName));
+ }
+
+ // If there are any other encoding conversions, please add them here.
+
+ }
+
+ /**
+ * Get the converter for the given encoding name.
+ *
+ * @param encoding PDF encoding name, e.g. GBK-EUC-H
+ * @return the registered converter, or null if no converter is
+ * defined for that name
+ */
+ public static final EncodingConverter getConverter(String encoding)
+ {
+ return (EncodingConverter)(_encodingMap.get(encoding));
+ }
+
+
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java Mon Mar 9 12:46:41 2009
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import org.fontbox.cmap.CMap;
+
+/**
+ * EncodingConverter converts a string or characters in one encoding, which is specified in the PDF
+ * file, to another string with the respective Java charset. The mapping from
+ * PDF encoding name to converter instance is maintained by EncodingConversionManager.
+ */
+public interface EncodingConverter
+{
+ /**
+ * Convert a string that was already produced by a cmap lookup.
+ *
+ * @param s the string to convert
+ * @return the converted string; the CJKConverter implementation in this
+ * commit returns s itself when no conversion is needed or possible
+ */
+ public String convertString(String s);
+
+ /**
+ * Convert bytes to a string.
+ *
+ * @param c the byte array holding the character code
+ * @param offset index of the first byte to convert
+ * @param length number of bytes to convert
+ * @param cmap the CMap of the font; the CJKConverter implementation in
+ * this commit accepts null here and uses the cmap only to check the
+ * code space ranges
+ * @return the converted string; the CJKConverter implementation returns
+ * null when it does not convert the bytes
+ */
+ public String convertBytes(byte [] c, int offset, int length, CMap cmap);
+}
Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=751664&r1=751663&r2=751664&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Mon Mar 9 12:46:41 2009
@@ -17,17 +17,17 @@
package org.apache.pdfbox.pdmodel.font;
import org.fontbox.afm.AFMParser;
-
import org.fontbox.afm.FontMetric;
-
import org.fontbox.cmap.CMapParser;
-
import org.fontbox.cmap.CMap;
+import org.apache.pdfbox.encoding.conversion.EncodingConversionManager;
+import org.apache.pdfbox.encoding.conversion.EncodingConverter;
import org.apache.pdfbox.encoding.AFMEncoding;
import org.apache.pdfbox.encoding.DictionaryEncoding;
import org.apache.pdfbox.encoding.Encoding;
import org.apache.pdfbox.encoding.EncodingManager;
+import org.apache.pdfbox.encoding.conversion.CMapSubstitution;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
@@ -69,7 +69,7 @@
public abstract class PDFont extends LoggingObject implements COSObjectable
{
- /**
+ /**
* The cos dictionary for this font.
*/
protected COSDictionary font;
@@ -226,7 +226,7 @@
* @throws IOException If there is an error drawing the specific string.
*/
public abstract void drawString( String string, Graphics g, float fontSize,
- AffineTransform at, float x, float y ) throws IOException;
+ AffineTransform at, float x, float y ) throws IOException;
/**
* Used for multibyte encodings.
@@ -387,6 +387,7 @@
else
{
String cmapName = encodingName.getName();
+ cmapName = CMapSubstitution.substituteCMap( cmapName );
String resourceRoot = "Resources/cmap/";
String resourceName = resourceRoot + cmapName;
parseCmap( resourceRoot, ResourceLoader.loadResource( resourceName ), encodingName );
@@ -435,6 +436,22 @@
{
retval = cmap.lookup( c, offset, length );
}
+ COSBase encoding_COS = font.getDictionaryObject(COSName.ENCODING);
+
+ if ( encoding_COS instanceof COSName ) {
+ EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encoding_COS).getName());
+
+ if ( converter != null ) {
+
+ if ( retval != null )
+ retval = converter.convertString(retval);
+ else
+ retval = converter.convertBytes(c, offset, length, cmap);
+
+ return retval;
+ }
+
+ }
//if we havn't found a value yet and
//we are still on the first byte and
//there is no cmap or the cmap does not have 2 byte mappings then try to encode
@@ -540,9 +557,9 @@
}
}
/**
- * Si la clé /Encoding existe dans le dictionnaire fonte il y a deux possibilités :
- * 1er cas : elle est associé à une reference contenant un dictionnaire de type encoding.
- * Ce dictionnaire PDF est représenté par un DictionaryEncoding.
+ * Si la cl� /Encoding existe dans le dictionnaire fonte il y a deux possibilit�s :
+ * 1er cas : elle est associ� � une reference contenant un dictionnaire de type encoding.
+ * Ce dictionnaire PDF est repr�sent� par un DictionaryEncoding.
* If the /Encoding Key does exist in the font dictionary, there are two cases :
* case one : The value associated with /Encoding is a reference to a dictionary.
* This dictionary is represented by an instance of DictionaryEncoding class
@@ -555,7 +572,7 @@
//file
COSName baseEncodingName = (COSName) encodingDic.getDictionaryObject(
COSName.BASE_ENCODING);
- //on ajoute une entrée /BaseEncoding dans /Encoding uniquement si elle en est absente
+ //on ajoute une entr�e /BaseEncoding dans /Encoding uniquement si elle en est absente
//if not find in Encoding dictinary target, we try to find it from else where
if( baseEncodingName == null)
{
@@ -729,8 +746,8 @@
COSName retvalue = null;
//recuperer le programme de fonte dans son stream qui doit se trouver
- //dans le flux référencé par à la clé FileFont lui même situé dans
- //le dictionnaire associé à /FontDescriptor du dictionnaire de type /Font courrant
+ //dans le flux r�f�renc� par � la cl� FileFont lui m�me situ� dans
+ //le dictionnaire associ� � /FontDescriptor du dictionnaire de type /Font courrant
//get the font program in the stream which should be located in
//the /FileFont Stream object himself in the /FontDescriptior of the current
//font dictionary