You are viewing a plain text version of this content. The canonical link for it is here.
Posted to j-users@xerces.apache.org by Christopher Sahnwaldt <jc...@gmx.net> on 2006/04/19 12:41:11 UTC
Re: Problem in Parsing xml with Korean Characters

This is (most likely) not a problem of the parser, but an
encoding problem, so I'm moving this to the users list.

You should avoid dealing with byte arrays and byte streams 
(InputStream, OutputStream) when you're processing XML. 
Let the parser figure out the file encoding (it reads the 
encoding="euc-kr" part), otherwise use Strings and character 
streams (Reader, Writer).

This should work:

/**
 * Entry point: parses {@code trial1.xml} (resolved against the working
 * directory) into a DOM tree, serializes that tree back to a string and
 * prints it on standard output.
 *
 * @param args command-line arguments; not used
 * @throws TransformerException if parsing or serializing the document fails
 */
public static void main(String[] args) throws TransformerException
{
  File input = new File("trial1.xml");
  String xml = toString(load(input));
  System.out.println(xml);
}

/**
 * Parses an XML file into a DOM document by running it through a
 * stylesheet-less transformer that copies the source into a DOM result.
 *
 * The file is handed over as a {@link StreamSource} rather than as
 * pre-decoded text, so the XML parser itself decides how to decode the
 * bytes (e.g. from the encoding declared in the XML prolog).
 *
 * @param file the XML file to read
 * @return the parsed document
 * @throws TransformerException if the file cannot be read or parsed
 */
private static Document load( File file ) throws TransformerException
{
  DOMResult domTree = new DOMResult();
  TransformerFactory factory = TransformerFactory.newInstance();
  Transformer identity = factory.newTransformer();
  identity.transform(new StreamSource(file), domTree);
  return (Document) domTree.getNode();
}

/**
 * Serializes a DOM document to its XML text by running it through a
 * stylesheet-less transformer that writes into an in-memory character
 * buffer.
 *
 * The output goes to a character stream (a {@link StringWriter}), not a
 * byte stream, so no byte encoding step is involved in producing the
 * returned string.
 *
 * @param doc the document to serialize
 * @return the XML text of {@code doc}
 * @throws TransformerException if serialization fails
 */
private static String toString( Document doc ) throws TransformerException
{
  StringWriter buffer = new StringWriter();
  Transformer identity = TransformerFactory.newInstance().newTransformer();
  identity.transform(new DOMSource(doc), new StreamResult(buffer));
  return buffer.toString();
}


Here are two articles that explain some of the background:

http://www.joelonsoftware.com/articles/Unicode.html
http://www.jorendorff.com/articles/unicode/java.html

Hope that helps,
Christopher.

> --- Ursprüngliche Nachricht ---
> Von: Sereena <se...@mindtree.com>
> An: xerces-j-dev@xml.apache.org
> Betreff: Problem in Parsing xml with Korean Characters
> Datum: Tue, 18 Apr 2006 13:54:15 +0000 (UTC)
> 
> I am trying to parse an xml with Korean characters in it, but when some of
> the 
> korean characters are encountered, the parsing stops. If I remove the 
> characters causing problem, the rest of the xml is also parsed. Could
> anyone 
> help me to get this solved so that I can parse the whole xml with any
> korean 
> character in it?
> Please note that I am not getting any exception here, but the parsing
> stops.
> 
> The code would look like this : 
> 
> File data = new File("E://Folder1..//trial1.xml");
> 				int fileSize = (int) data.length();
> 				FileInputStream file = new FileInputStream
> (data);
> 				byte[] data2 = new byte[fileSize];
> 				
> 				
> 				for(int i=0; i < fileSize; i++ ) 
> 				{
> 						data2[i] = (byte) file.read();
> //						System.out.println(data2[i]);
> 				}
> 							
> 				file.close(); 
>                                 DocumentBuilderFactory dbf = 
> DocumentBuilderFactory.newInstance();
> 				DocumentBuilder db = dbf.newDocumentBuilder();
>                                 doc = db.parse(new InputSource(new 
> ByteArrayInputStream(data2)));
> 
> //The following is to get the document in string format
>                                 System.out.println("Reconverting");		
> 				byte [] removeResult=document2bytes
> (doc.getDocumentElement());
>                                 String result = new String(removeResult);
> 				System.out.println("Result =" + result);
> 
>                                
> System.out.println(encodingString("utf-8","iso-
> 8859-1",result));
> 
> 
> public static byte[] document2bytes(Node node) {
> 				try {
> 					Source source = new DOMSource(node);
>                				ByteArrayOutputStream out = new 
> ByteArrayOutputStream();
> 					StringWriter stringWriter = new 
> StringWriter();
> 					Result result = new StreamResult(out);
> 					TransformerFactory factory = 
> TransformerFactory.newInstance();
> 					Transformer transformer = 
> factory.newTransformer();
> 					transformer.transform(source, result);
> 					return out.toByteArray();
> 				} catch (TransformerConfigurationException e) {
> 					e.printStackTrace();
> 				} catch (TransformerException e) {
> 					e.printStackTrace();
> 				}
> 				return null;
> 	}
> 
> public static String encodingString(String fromEnc, String toEnc, String
> value)
> 			throws IOException {
> 			if (value != null) {
> 				if ("iso-8859-1".equals(toEnc)) {
> 					System.out.println("[encodeString] 
> value from static table cell element " + value);
> 					value = new String(value.getBytes
> (), "UTF-8");
> 					System.out.println("[encodeString] 
> Before encoding " + value);
> 					value = escapingNCR(value, false);
> 					System.out.println(" [encodeString] 
> After encoding NCR " + value);
> 				}
> 				else {
> 					System.out.println("[encodeString] 
> Before encoding " + value);
> 					ByteArrayInputStream bis = new 
> ByteArrayInputStream(value.getBytes());
> 					ByteArrayOutputStream bos = new 
> ByteArrayOutputStream();
> 					// Set up character stream
> 					Reader r = new BufferedReader(new 
> InputStreamReader(bis, fromEnc));
> 					Writer w = new BufferedWriter(new 
> OutputStreamWriter(bos, toEnc));
> 				
> 					char[] buffer = new char[4096];
> 					int len;
> 					while ((len = r.read(buffer)) != -1)
> 						w.write(buffer, 0, len);
> 					r.close();
> 					w.flush();
> 					w.close();
> 					value = bos.toString();
> 					System.out.println("[encodeString] 
> After encoding " + value);
> 				}
> 			}
> 			return value;
> 		}
> 
> 
> public static String escapingNCR(String str, boolean escapeAscii) 
> 		{
> 		   String ostr = new String();
> 
> 		   for(int i=0; i<str.length(); i++) {
> 
> 			  char ch = str.charAt(i);
> 			  //System.out.println(new String(new char[]{ch})); 	
> 	
> 			  if (!escapeAscii && ((ch >= 0x0020) && (ch <= 
> 0x007e)) || specialSaveChars.indexOf(ch) >= 0) {
> 				ostr += ch ;
> 			  }else {
> 				ostr += "&#x" ;
> 				String hex = Integer.toHexString(str.charAt(i) 
> & 0xFFFF);
> 				if (hex.length() == 2) {
> 					ostr += "00" ;
> 				}
> 				ostr += hex.toUpperCase(Locale.ENGLISH);
> 				ostr += ";";
> 			  }
> 		   }
> 
> 		   return (ostr);
> 		}
> 
> 
> The xml 'trial1.xml' that I parse could look like this:
> 
> ---------------------------------------------------
> <?xml version="1.0" encoding="euc-kr"?>
> <TrialXML>ÃÃÂ·Â©ÃÂ¬Â¸Â° Â¾Ã®Â±ÃÂ·Â¹Â½ÃÂºÃª Â±ÃÂ·ÃÂ½Âº </TrialXML>
> ---------------------------------------------------
> 
> 
> 
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: j-dev-unsubscribe@xerces.apache.org
> For additional commands, e-mail: j-dev-help@xerces.apache.org
> 



---------------------------------------------------------------------
To unsubscribe, e-mail: j-users-unsubscribe@xerces.apache.org
For additional commands, e-mail: j-users-help@xerces.apache.org