You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@openoffice.apache.org by hd...@apache.org on 2012/08/08 11:31:48 UTC

svn commit: r1370716 - /incubator/ooo/trunk/main/sw/source/filter/html/htmlatr.cxx

Author: hdu
Date: Wed Aug  8 09:31:47 2012
New Revision: 1370716

URL: http://svn.apache.org/viewvc?rev=1370716&view=rev
Log:
#i120442# fix html-flavored copy+paste of unicode surrogate pairs

Patch-by: Chen Peng
Found-by: Yan Ji
Review-by and minor modifications: Herbert Duerr

Modified:
    incubator/ooo/trunk/main/sw/source/filter/html/htmlatr.cxx

Modified: incubator/ooo/trunk/main/sw/source/filter/html/htmlatr.cxx
URL: http://svn.apache.org/viewvc/incubator/ooo/trunk/main/sw/source/filter/html/htmlatr.cxx?rev=1370716&r1=1370715&r2=1370716&view=diff
==============================================================================
--- incubator/ooo/trunk/main/sw/source/filter/html/htmlatr.cxx (original)
+++ incubator/ooo/trunk/main/sw/source/filter/html/htmlatr.cxx Wed Aug  8 09:31:47 2012
@@ -2628,9 +2628,22 @@ Writer& OutHTML_SwTxtNode( Writer& rWrt,
 
 			if( bOutChar )
 			{
-				sal_Unicode c = rStr.GetChar( nStrPos );
-				// versuche nach ungefaehr 255 Zeichen eine neue Zeile zu
-				// beginnen, aber nicht in PRE und nur bei Spaces
+				// #i120442#: get the UTF-32 code point, combining a UTF-16 surrogate pair if one is present
+				sal_uInt64 c = rStr.GetChar( nStrPos );
+				if( nStrPos < nEnde - 1 )
+				{
+					const sal_Unicode d = rStr.GetChar( nStrPos + 1 );
+					if( (c >= 0xd800 && c <= 0xdbff) && (d >= 0xdc00 && d <= 0xdfff) )
+					{
+						sal_uInt64 templow = d&0x03ff;
+						sal_uInt64 temphi = ((c&0x03ff) + 0x0040)<<10;
+						c = temphi|templow;
+						nStrPos++;
+					}
+				}
+				
+				// try to split a line after about 255 characters
+				// at a space character unless in a PRE-context
 				if( ' '==c && !rHTMLWrt.nLastParaToken  )
 				{
 					xub_StrLen nLineLen;
@@ -2642,7 +2655,7 @@ Writer& OutHTML_SwTxtNode( Writer& rWrt,
 					xub_StrLen nWordLen = rStr.Search( ' ', nStrPos+1 );
 					if( nWordLen == STRING_NOTFOUND )
 						nWordLen = nEnde;
-                    nWordLen = nWordLen - nStrPos;
+					nWordLen -= nStrPos;
 
 					if( nLineLen >= rHTMLWrt.nWhishLineLen ||
 						(nLineLen+nWordLen) >= rHTMLWrt.nWhishLineLen )
@@ -2662,13 +2675,20 @@ Writer& OutHTML_SwTxtNode( Writer& rWrt,
 						HTMLOutFuncs::FlushToAscii( rWrt.Strm(), aContext );
 						HTMLOutFuncs::Out_AsciiTag( rWrt.Strm(), OOO_STRING_SVTOOLS_HTML_linebreak );
 					}
+					// #i120442#: if c is outside the Unicode Basic Multilingual Plane, output it as a numeric character reference "&#NNNNN;"
+					else if( c > 0xffff)
+					{
+						ByteString sOut("&#");
+						sOut += ByteString::CreateFromInt64( (sal_uInt64)c );
+						sOut += ';';
+						rWrt.Strm() << sOut.GetBuffer();
+					}
 					else
-						HTMLOutFuncs::Out_Char( rWrt.Strm(), c, aContext, &rHTMLWrt.aNonConvertableCharacters );
+						HTMLOutFuncs::Out_Char( rWrt.Strm(), (sal_Unicode)c, aContext, &rHTMLWrt.aNonConvertableCharacters );
 
-					// Wenn das letzte Zeichen eines Absatzed ein harter
-					// Zeilen-Umbruch ist brauchen wir noch ein <BR> mehr, weil
-					// Netscape & Co in diesem Fall fuer den naechsten Absatz
-					// nicht in die naechste Zeile gehen.
+					// if a paragraph's last character is a hard line break
+					// then we need to add an extra <br>
+					// because browsers like Mozilla wouldn't add a line for the next paragraph
 					bWriteBreak = (0x0a == c) &&
 								  (HTML_PREFORMTXT_ON != rHTMLWrt.nLastParaToken);
 				}