You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2009/03/09 13:46:41 UTC

svn commit: r751664 - in /incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox: encoding/conversion/ pdmodel/font/

Author: lehmi
Date: Mon Mar  9 12:46:41 2009
New Revision: 751664

URL: http://svn.apache.org/viewvc?rev=751664&view=rev
Log:
PDFBOX-420: Adding CJK support, especially for text extraction. Thanks to Pin Xue http://www.pinxue.net

Added:
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java   (with props)
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java   (with props)
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java   (with props)
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java   (with props)
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java   (with props)
Modified:
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java Mon Mar  9 12:46:41 2009
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.encoding.conversion;
+
+import org.fontbox.cmap.CMap;
+import java.io.UnsupportedEncodingException;
+
+
+/**
+ *  CJKConverter converts text for the PDF encodings defined in CJKEncodings
+ *
+ *  @author pinxue <http://www.pinxue.net>, Holly Lee <holly.lee (at) gmail.com>
+ */
+class CJKConverter implements EncodingConverter
+{
+      /** The PDF encoding name this converter was constructed with */
+	  private String _encoding = null;
+	  /** The java charset name; null if CJKEncodings has no mapping - NOTE(review): the convert methods do not guard against null */
+	  private String _charset = null;
+
+
+      /**
+       *  Constructs a CJKConverter from a PDF encoding name; the java charset name is looked up in CJKEncodings
+       */
+      public CJKConverter(String encoding)
+      {
+             _encoding = encoding;
+			 _charset = CJKEncodings.getCharset(encoding);
+      }
+
+       /**
+        *  Converts a string that a cmap lookup has already produced but
+        *  whose characters still have to be re-decoded with this converter's
+        *  charset. Single-character strings and the UTF-16BE charset are
+        *  returned unchanged; so is s if the charset is not supported.
+        *
+        *  Note: pdfbox sets the string to the UTF-16BE charset before
+        *  calling into this method.
+        */
+       public String convertString(String s)
+       {
+              if ( s.length() == 1 )
+			  	 return s;
+
+              if ( _charset.equalsIgnoreCase("UTF-16BE") )
+			  	 return s;
+
+              try {
+			      return new String(s.getBytes("UTF-16BE"), _charset);
+			  }
+			  catch ( UnsupportedEncodingException uee ) {
+			      return s;
+			  }
+       }
+
+	   /**
+	    *  Converts bytes to a string with this converter's charset, but only when
+	    *  the CMap reports them inside its code space ranges (otherwise null is returned).
+	    *
+	    *  @return Converted string (platform-default decoding if the charset is unsupported), or null.
+	    */
+	   public String convertBytes(byte [] c, int offset, int length, CMap cmap)
+	   {
+	          if ( cmap != null ) {
+
+                 try {
+                     if ( cmap.isInCodeSpaceRanges(c, offset, length) )
+		      	        return new String(c, offset, length, _charset);
+			      	 else
+			      	    return null;
+
+		  	     }
+			     catch ( UnsupportedEncodingException uee ) {
+			         return new String(c, offset, length);
+			     }
+
+		  	  }
+
+              // No cmap given, so the code space ranges cannot be checked
+			  return null;
+	   }
+
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKConverter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java Mon Mar  9 12:46:41 2009
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * This class maps PDF encoding names to Java charset names for CJK encodings
+ *
+ * @author  Pin Xue (http://www.pinxue.net), Holly Lee (holly.lee (at) gmail.com)
+ * @version $Revision: 1.0 $
+ */
+class CJKEncodings
+{
+   // Mapping: PDF encoding name -> Java (IANA) charset name
+   private static HashMap _mapping = new HashMap();
+
+   static
+   {
+       // Chinese (Simplified)
+       _mapping.put("GB-EUC-H",        "GB2312");              // Microsoft Code Page 936 (lfCharSet 0x86), GB 2312-80 character set, EUC-CN encoding
+       _mapping.put("GB-EUC-V",        "GB2312");              // Vertical version of GB-EUC-H
+       _mapping.put("GBpc-EUC-H",      "GB2312");              // Mac OS, GB 2312-80 character set, EUC-CN encoding, Script Manager code 19
+       _mapping.put("GBpc-EUC-V",      "GB2312");              // Vertical version of GBpc-EUC-H
+       _mapping.put("GBK-EUC-H",       "GBK");                 // Microsoft Code Page 936 (lfCharSet 0x86), GBK character set, GBK encoding
+       _mapping.put("GBK-EUC-V",       "GBK");                 // Vertical version of GBK-EUC-H
+       _mapping.put("GBKp-EUC-H",      "GBK");                 // Same as GBK-EUC-H but replaces half-width Latin characters with proportional forms and maps character code 0x24 to a dollar sign ($) instead of a yuan symbol (¥)
+       _mapping.put("GBKp-EUC-V",      "GBK");                 // Vertical version of GBKp-EUC-H
+       _mapping.put("GBK2K-H",         "GB18030");             // GB 18030-2000 character set, mixed 1-, 2-, and 4-byte encoding
+       _mapping.put("GBK2K-V",         "GB18030");             // Vertical version of GBK2K-H
+       _mapping.put("UniGB-UCS2-H",    "ISO-10646-UCS-2");     // Unicode (UCS-2) encoding for the Adobe-GB1 character collection
+       _mapping.put("UniGB-UCS2-V",    "ISO-10646-UCS-2");     // Vertical version of UniGB-UCS2-H
+       _mapping.put("UniGB-UTF16-H",   "UTF-16BE");            // Unicode (UTF-16BE) encoding for the Adobe-GB1 character collection; contains mappings for all characters in the GB18030-2000 character set
+       _mapping.put("UniGB-UTF16-V",   "UTF-16BE");            // Vertical version of UniGB-UTF16-H
+
+       // Chinese (Traditional)
+       _mapping.put("B5pc-H",  "BIG5");                    // Mac OS, Big Five character set, Big Five encoding, Script Manager code 2
+       _mapping.put("B5pc-V",  "BIG5");                    // Vertical version of B5pc-H
+       _mapping.put("HKscs-B5-H",      "Big5-HKSCS");          // Hong Kong SCS, an extension to the Big Five character set and encoding
+       _mapping.put("HKscs-B5-V",      "Big5-HKSCS");          // Vertical version of HKscs-B5-H
+       _mapping.put("ETen-B5-H",       "BIG5");                // Microsoft Code Page 950 (lfCharSet 0x88), Big Five character set with ETen extensions
+       _mapping.put("ETen-B5-V",       "BIG5");                // Vertical version of ETen-B5-H
+       _mapping.put("ETenms-B5-H",     "BIG5");                // Same as ETen-B5-H but replaces half-width Latin characters with proportional forms
+       _mapping.put("ETenms-B5-V",     "BIG5");                // Vertical version of ETenms-B5-H
+       _mapping.put("CNS-EUC-H",       "HZ");          // CNS 11643-1992 character set, EUC-TW encoding - NOTE(review): mapped to "HZ", confirm this charset name really decodes EUC-TW
+       _mapping.put("CNS-EUC-V",       "HZ");          // Vertical version of CNS-EUC-H
+       _mapping.put("UniCNS-UCS2-H",   "ISO-10646-UCS-2");             // Unicode (UCS-2) encoding for the Adobe-CNS1 character collection
+       _mapping.put("UniCNS-UCS2-V",   "ISO-10646-UCS-2");             // Vertical version of UniCNS-UCS2-H
+       _mapping.put("UniCNS-UTF16-H",  "UTF-16BE");            // Unicode (UTF-16BE) encoding for the Adobe-CNS1 character collection; contains mappings for all the characters in the HKSCS-2001 character set and contains both 2- and 4- byte character codes
+       _mapping.put("UniCNS-UTF16-V",  "UTF-16BE");            // Vertical version of UniCNS-UTF16-H
+
+       //Japanese - NOTE(review): Shift-JIS, EUC-JP and ISO-2022-JP encodings below all map to "JIS"; confirm the JVM decodes each of them correctly under that name
+       _mapping.put("83pv-RKSJ-H",     "JIS");                 // Mac OS, JIS X 0208 character set with KanjiTalk6 extensions, Shift-JIS encoding, Script Manager code 1
+       _mapping.put("90ms-RKSJ-H",     "JIS");                 // Microsoft Code Page 932 (lfCharSet 0x80), JIS X 0208 character set with NEC and IBM- extensions
+       _mapping.put("90ms-RKSJ-V",     "JIS");                 // Vertical version of 90ms-RKSJ-H
+       _mapping.put("90msp-RKSJ-H",    "JIS");                 // Same as 90ms-RKSJ-H but replaces half-width Latin characters with proportional forms
+       _mapping.put("90msp-RKSJ-V",    "JIS");                 // Vertical version of 90msp-RKSJ-H
+       _mapping.put("90pv-RKSJ-H",     "JIS");                 // Mac OS, JIS X 0208 character set with KanjiTalk7 extensions, Shift-JIS encoding, Script Manager code 1
+       _mapping.put("Add-RKSJ-H",      "JIS");                 // JIS X 0208 character set with Fujitsu FMR extensions, Shift-JIS encoding
+       _mapping.put("Add-RKSJ-V",      "JIS");                 // Vertical version of Add-RKSJ-H
+       _mapping.put("EUC-H",   "JIS");                    // JIS X 0208 character set, EUC-JP encoding
+       _mapping.put("EUC-V",   "JIS");                    // Vertical version of EUC-H
+       _mapping.put("Ext-RKSJ-H",      "JIS");                 // JIS C 6226 (JIS78) character set with NEC extensions, Shift-JIS encoding
+       _mapping.put("Ext-RKSJ-V",      "JIS");                 // Vertical version of Ext-RKSJ-H
+       _mapping.put("H",       "JIS");                    // JIS X 0208 character set, ISO-2022-JP encoding
+       _mapping.put("V",       "JIS");                    // Vertical version of H
+       _mapping.put("UniJIS-UCS2-H",   "ISO-10646-UCS-2");             // Unicode (UCS-2) encoding for the Adobe-Japan1 character collection
+       _mapping.put("UniJIS-UCS2-V",   "ISO-10646-UCS-2");             // Vertical version of UniJIS-UCS2-H
+       _mapping.put("UniJIS-UCS2-HW-H",        "ISO-10646-UCS-2");     // Same as UniJIS-UCS2-H but replaces proportional Latin characters with half-width forms
+       _mapping.put("UniJIS-UCS2-HW-V",        "ISO-10646-UCS-2");     // Vertical version of UniJIS-UCS2-HW-H
+       _mapping.put("UniJIS-UTF16-H",  "UTF-16BE");             // Unicode (UTF-16BE) encoding for the Adobe-Japan1 character collection; contains mappings for all characters in the JIS X 0213:1000 character set
+       _mapping.put("UniJIS-UTF16-V",  "UTF-16BE");            // Vertical version of UniJIS-UTF16-H
+       _mapping.put("Identity-H",       "JIS");                    // Identity-H is treated like "H" here - NOTE(review): Identity-H is not specific to Japanese, confirm "JIS" is intended
+       _mapping.put("Identity-V",       "JIS");                    // Vertical version of Identity-H
+
+       //Korean
+       _mapping.put("KSC-EUC-H",       "KSC");                 // KS X 1001:1992 character set, EUC-KR encoding - NOTE(review): confirm the JVM resolves the charset name "KSC"
+       _mapping.put("KSC-EUC-V",       "KSC");                 // Vertical version of KSC-EUC-H
+       _mapping.put("KSCms-UHC-H",     "KSC");                 // Microsoft Code Page 949 (lfCharSet 0x81), KS X 1001:1992 character set plus additional hangul, Unified Hangul Code (UHC) encoding
+       _mapping.put("KSCms-UHC-V",     "KSC");                 // Vertical version of KSCms-UHC-H
+       _mapping.put("KSCms-UHC-HW-H",  "KSC");                 // Same as KSCms-UHC-H but replaces proportional Latin characters with half-width forms
+       _mapping.put("KSCms-UHC-HW-V",  "KSC");                 // Vertical version of KSCms-UHC-HW-H
+       _mapping.put("KSCpc-EUC-H",     "KSC");                 // Mac OS, KS X 1001:1992 character set with Mac OS KH extensions, Script Manager Code 3
+       _mapping.put("UniKS-UCS2-H",    "ISO-10646-UCS-2");             // Unicode (UCS-2) encoding for the Adobe-Korea1 character collection
+       _mapping.put("UniKS-UCS2-V",    "ISO-10646-UCS-2");             // Vertical version of UniKS-UCS2-H
+       _mapping.put("UniKS-UTF16-H",   "UTF-16BE");            // Unicode (UTF-16BE) encoding for the Adobe-Korea1 character collection
+       _mapping.put("UniKS-UTF16-V",   "UTF-16BE");            // Vertical version of UniKS-UTF16-H
+   }
+
+
+   /**
+    *  Gets the Java charset name registered for the given PDF encoding name.
+    *
+    *  @param encoding   PDF encoding name; if it starts with "COSName" its first 8 characters and its last character are stripped before the lookup
+    *  @return Java charset name, or null if not found
+    */
+   public static final String getCharset( String encoding )
+   {
+       if ( encoding.startsWith("COSName"))
+           encoding = encoding.substring(8, encoding.length()-1);
+
+       return (String)(_mapping.get(encoding));
+   }
+
+   /**
+    *  Returns an iterator over all registered PDF encoding names (the keys of the mapping)
+    */
+   public static final Iterator getEncodingIterator()
+   {
+          return _mapping.keySet().iterator();
+   }
+
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CJKEncoding.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java Mon Mar  9 12:46:41 2009
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import java.util.HashMap;
+
+/**
+ * This class maps CMap names to the names of the substitute (mostly *-UCS2) CMaps that are used instead for CJK encodings
+ * @author Andreas Lehmkühler
+ *
+ */
+
+public class CMapSubstitution {
+
+	private static HashMap cmapSubstitutions = new HashMap();
+
+	static {
+		
+		// It is not known whether these mappings are complete; further
+		// substitutions may still have to be added.
+		
+		// chinese simplified
+		cmapSubstitutions.put( "Adobe-GB1-4", "Adobe-GB1-UCS2" );
+		cmapSubstitutions.put( "GBK-EUC-H", "GBK-EUC-UCS2" );
+		cmapSubstitutions.put( "GBK-EUC-V", "GBK-EUC-UCS2" );
+		cmapSubstitutions.put( "GBpc-EUC-H", "GBpc-EUC-UCS2C" );
+		cmapSubstitutions.put( "GBpc-EUC-V", "GBpc-EUC-UCS2C" );
+
+		// chinese traditional
+		cmapSubstitutions.put( "Adobe-CNS1-4", "Adobe-CNS1-UCS2" );
+		cmapSubstitutions.put( "B5pc-H", "B5pc-UCS2" );
+		cmapSubstitutions.put( "B5pc-V", "B5pc-UCS2" );
+		cmapSubstitutions.put( "ETen-B5-H", "ETen-B5-UCS2" );
+		cmapSubstitutions.put( "ETen-B5-V", "ETen-B5-UCS2" );
+		cmapSubstitutions.put( "ETenms-B5-H", "ETen-B5-UCS2" );
+		cmapSubstitutions.put( "ETenms-B5-V", "ETen-B5-UCS2" );
+
+		// japanese
+		cmapSubstitutions.put( "90ms-RKSJ-H", "90ms-RKSJ-UCS2" );
+		cmapSubstitutions.put( "90ms-RKSJ-V", "90ms-RKSJ-UCS2" );
+		cmapSubstitutions.put( "90msp-RKSJ-H", "90ms-RKSJ-UCS2" );
+		cmapSubstitutions.put( "90msp-RKSJ-V", "90ms-RKSJ-UCS2" );
+		cmapSubstitutions.put( "90pv-RKSJ-H", "90pv-RKSJ-UCS2");
+		cmapSubstitutions.put( "UniJIS-UCS2-HW-H", "UniJIS-UCS2-H" );
+		cmapSubstitutions.put( "Adobe-Japan1-4", "Adobe-Japan1-UCS2");
+		cmapSubstitutions.put( "Identity-H", "Adobe-Japan1-UCS2");
+
+	}
+	
+	public static String substituteCMap(String cmapName) { // returns the registered substitute for cmapName, or cmapName itself if there is none
+		if (cmapSubstitutions.containsKey(cmapName))
+			return (String)cmapSubstitutions.get(cmapName);
+		return cmapName;
+	}
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java Mon Mar  9 12:46:41 2009
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import java.util.Iterator;
+import java.util.HashMap;
+
+/**
+ *  EncodingConversionManager maintains the relationship between a PDF encoding
+ *  name and its EncodingConverter instance. PDF encoding names such as
+ *  GBK-EUC-H have to be converted to a java charset name before a
+ *  java string instance can be constructed
+ */
+public class EncodingConversionManager
+{
+       /**
+        *  Mapping from PDF encoding name to EncodingConverter instance
+        */
+       private static HashMap _encodingMap = new HashMap();
+
+	   /**
+	    *  Initialize the encodingMap before anything calls us
+	    */
+       static {
+
+	       // Register one CJKConverter per encoding name known to CJKEncodings
+	       Iterator it = CJKEncodings.getEncodingIterator();
+
+		   while ( it.hasNext() ) {
+		         String encodingName = (String)(it.next());
+		   	     _encodingMap.put(encodingName, new CJKConverter(encodingName));
+		   }
+
+		   // If there are any other encoding conversions, please add them here.
+
+   	   }
+
+	   /**
+	    *  Get the converter for the given encoding name. If no converter is
+	    *  defined for it, null is returned
+	    */
+	   public static final EncodingConverter getConverter(String encoding)
+	   {
+	          return (EncodingConverter)(_encodingMap.get(encoding));
+	   }
+
+
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConversionManager.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java?rev=751664&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java Mon Mar  9 12:46:41 2009
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.encoding.conversion;
+
+import org.fontbox.cmap.CMap;
+
+/**
+ *  EncodingConverter converts a string or bytes in an encoding that is specified in the PDF
+ *  file to a string using the respective java charset. The mapping from
+ *  PDF encoding name to converter instance is maintained by EncodingConversionManager
+ */
+public interface EncodingConverter
+{
+       /**
+        *  Convert a string; PDFont calls this with the non-null result of a cmap lookup
+        */
+       public String convertString(String s);
+
+	   /**
+	    *  Convert bytes to a string; PDFont calls this when no cmap lookup result is available
+	    */
+	   public String convertBytes(byte [] c, int offset, int length, CMap cmap);
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/EncodingConverter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=751664&r1=751663&r2=751664&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Mon Mar  9 12:46:41 2009
@@ -17,17 +17,17 @@
 package org.apache.pdfbox.pdmodel.font;
 
 import org.fontbox.afm.AFMParser;
-
 import org.fontbox.afm.FontMetric;
-
 import org.fontbox.cmap.CMapParser;
-
 import org.fontbox.cmap.CMap;
+import org.apache.pdfbox.encoding.conversion.EncodingConversionManager;
+import org.apache.pdfbox.encoding.conversion.EncodingConverter;
 
 import org.apache.pdfbox.encoding.AFMEncoding;
 import org.apache.pdfbox.encoding.DictionaryEncoding;
 import org.apache.pdfbox.encoding.Encoding;
 import org.apache.pdfbox.encoding.EncodingManager;
+import org.apache.pdfbox.encoding.conversion.CMapSubstitution;
 
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
@@ -69,7 +69,7 @@
 public abstract class PDFont extends LoggingObject implements COSObjectable
 {
 
-    /**
+	/**
      * The cos dictionary for this font.
      */
     protected COSDictionary font;
@@ -226,7 +226,7 @@
      * @throws IOException If there is an error drawing the specific string.
      */
     public abstract void drawString( String string, Graphics g, float fontSize,
-        AffineTransform at, float x, float y ) throws IOException;
+    	AffineTransform at, float x, float y ) throws IOException;
 
     /**
      * Used for multibyte encodings.
@@ -387,6 +387,7 @@
                         else
                         {
                             String cmapName = encodingName.getName();
+                            cmapName = CMapSubstitution.substituteCMap( cmapName );
                             String resourceRoot = "Resources/cmap/";
                             String resourceName = resourceRoot + cmapName;
                             parseCmap( resourceRoot, ResourceLoader.loadResource( resourceName ), encodingName );
@@ -435,6 +436,22 @@
         {
             retval = cmap.lookup( c, offset, length );
         }
+        COSBase encoding_COS = font.getDictionaryObject(COSName.ENCODING);
+
+        if ( encoding_COS instanceof COSName ) {
+        	EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encoding_COS).getName());
+
+        	if ( converter != null ) {
+
+        		if ( retval != null )
+        			retval = converter.convertString(retval);
+        		  else
+        			retval = converter.convertBytes(c, offset, length, cmap);
+
+        		return retval;
+        	}
+
+        }
         //if we havn't found a value yet and
         //we are still on the first byte and
         //there is no cmap or the cmap does not have 2 byte mappings then try to encode
@@ -540,9 +557,9 @@
                 }
             }
             /**
-             * Si la clé /Encoding existe dans le dictionnaire fonte il y a deux possibilités :
-             * 1er cas : elle est associé à une reference contenant un dictionnaire de type encoding.
-             * Ce dictionnaire PDF est représenté par un DictionaryEncoding.
+             * Si la clé /Encoding existe dans le dictionnaire fonte il y a deux possibilités :
+             * 1er cas : elle est associé à une reference contenant un dictionnaire de type encoding.
+             * Ce dictionnaire PDF est représenté par un DictionaryEncoding.
              * If the /Encoding Key does exist in the font dictionary, there are two cases :
              * case one : The value associated with /Encoding is a reference to a dictionary.
              * This dictionary is represented by an instance of DictionaryEncoding class
@@ -555,7 +572,7 @@
                 //file
                 COSName baseEncodingName = (COSName) encodingDic.getDictionaryObject(
                     COSName.BASE_ENCODING);
-                //on ajoute une entrée /BaseEncoding dans /Encoding uniquement si elle en est absente
+                //on ajoute une entrée /BaseEncoding dans /Encoding uniquement si elle en est absente
                 //if not find in Encoding dictinary target, we try to find it from else where
                 if( baseEncodingName == null)
                 {
@@ -729,8 +746,8 @@
 
         COSName retvalue = null;
         //recuperer le programme de fonte dans son stream qui doit se trouver
-        //dans le flux référencé par à la clé FileFont lui même situé dans
-        //le dictionnaire associé à /FontDescriptor du dictionnaire de type /Font courrant
+        //dans le flux référencé par à la clé FileFont lui même situé dans
+        //le dictionnaire associé à /FontDescriptor du dictionnaire de type /Font courrant
         //get the font program in the stream which should be located in
          //the /FileFont Stream object himself in the /FontDescriptior of the current
         //font dictionary