You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@daffodil.apache.org by sl...@apache.org on 2019/06/27 12:21:15 UTC
[incubator-daffodil] branch master updated: Fixes decoding unicode
with surrogate pairs
This is an automated email from the ASF dual-hosted git repository.
slawrence pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-daffodil.git
The following commit(s) were added to refs/heads/master by this push:
new 5b7230c Fixes decoding unicode with surrogate pairs
5b7230c is described below
commit 5b7230c48ee532df13dae07287c8775e3a861655
Author: Olabusayo Kilo <ok...@tresys.com>
AuthorDate: Wed Jun 26 10:38:53 2019 -0400
Fixes decoding unicode with surrogate pairs
Fixes decoding function for unicode characters that decode to a surrogate
pair
DAFFODIL-2123
---
.../main/scala/org/apache/daffodil/io/Dump.scala | 88 +++++++++++++++-------
.../scala/org/apache/daffodil/io/TestDump.scala | 25 ++++++
2 files changed, 86 insertions(+), 27 deletions(-)
diff --git a/daffodil-io/src/main/scala/org/apache/daffodil/io/Dump.scala b/daffodil-io/src/main/scala/org/apache/daffodil/io/Dump.scala
index e41d965..0e2c6eb 100644
--- a/daffodil-io/src/main/scala/org/apache/daffodil/io/Dump.scala
+++ b/daffodil-io/src/main/scala/org/apache/daffodil/io/Dump.scala
@@ -27,6 +27,7 @@ import org.apache.daffodil.util.Misc
import org.apache.daffodil.equality._
import java.nio.charset.{ CharsetDecoder => JavaCharsetDecoder }
import java.nio.charset.{ Charset => JavaCharset }
+import java.nio.charset.CoderResult
/**
* Hex/Bits and text dump formats for debug/trace purposes.
@@ -147,7 +148,7 @@ class DataDumper {
txtsb ++= paddingFromPriorLine
while (i <= limit0b) {
val bytePos0b = addr + i
- val (char, nBytesConsumed, width) = convertToChar(bytePos0b, endByteAddress0b, byteSource, decoder)
+ val (charRep, nBytesConsumed, width) = convertToCharRepr(bytePos0b, endByteAddress0b, byteSource, decoder)
Assert.invariant(nBytesConsumed > 0)
// some characters will print double width. It is assumed all such
// characters occupy at least one byte.
@@ -187,7 +188,7 @@ class DataDumper {
case (n, x) => Assert.impossible()
})
val trimmedPadding = padding.take(padding.length - paddingFromPriorLine.length)
- txtsb ++= char.toString + trimmedPadding
+ txtsb ++= charRep + trimmedPadding
i += nBytesConsumed
}
}
@@ -422,8 +423,8 @@ class DataDumper {
* relative to a regular monospaced font character. This is for trying to get
* east asian and other double-wide characters to line up properly in columns.
*/
- private def charNColumns(char: Char): Int = {
- val charWidth = UCharacter.getIntPropertyValue(char, UProperty.EAST_ASIAN_WIDTH)
+ private def charNColumns(codepoint: Int): Int = {
+ val charWidth = UCharacter.getIntPropertyValue(codepoint, UProperty.EAST_ASIAN_WIDTH)
charWidth match {
//
// see http://unicode.org/reports/tr11/tr11-8.html
@@ -440,27 +441,23 @@ class DataDumper {
private def getReplacingDecoder(optEncodingName: Option[String]): Option[JavaCharsetDecoder] = {
val cs = optEncodingName.map { JavaCharset.forName(_) }
lazy val decoder = cs.map { _.newDecoder() }
- decoder foreach { d =>
- d.onMalformedInput(CodingErrorAction.REPLACE)
- d.onUnmappableCharacter(CodingErrorAction.REPLACE)
- }
decoder
}
/**
* Decoder must be setup for REPLACE on decode error.
*/
- private def convertToChar(
+ private def convertToCharRepr(
startingBytePos0b: Long,
endingBytePos0b: Long,
bs: ByteSource,
- decoder: Option[JavaCharsetDecoder]): (Char, Int, Int) = {
+ decoder: Option[JavaCharsetDecoder]): (String, Int, Int) = {
Assert.invariant(decoder.map { d => Misc.isAsciiBased(d.charset()) }.getOrElse(true))
decoder match {
case Some(dec) => {
val bb = ByteBuffer.allocate(6)
- val cb = CharBuffer.allocate(1)
+ var cb = CharBuffer.allocate(1)
val lastAvailableBytePos0b = scala.math.min(endingBytePos0b, startingBytePos0b + 5) // widest possible char representation is 6 bytes.
val nBytes = (lastAvailableBytePos0b - startingBytePos0b).toInt + 1
Assert.invariant(nBytes > 0) // have to have at least 1 byte left
@@ -476,25 +473,62 @@ class DataDumper {
}
bb.flip()
Assert.invariant(bb.remaining > 0)
- val cr = dec.decode(bb, cb, true)
- if (cr.isOverflow || cr.isUnderflow) {
- // An overflow means that we got our one character, but there were more bytes available that could
- // be decoded. We're not interested in those.
+ var cr = CoderResult.OVERFLOW
+ var nConsumedBytes = 0
+ var remapped = ""
+ var nCols = 0
+ do {
+ // An overflow with no bytes consumed means the output buffer was too small to hold the decoded character (as can
+ // be the case with a 4 byte character sequence, which decodes to a surrogate pair). In that case we call decode
+ // again with a larger output buffer until we consume something or the output buffer's capacity exceeds the input
+ // buffer's capacity. An overflow with bytes consumed means we decoded our character and there were left over
+ // bytes in the input; we can proceed, and we'll get the left over bytes on the next run.
+ cr = dec.decode(bb, cb, true)
+ nConsumedBytes = bb.position()
+ if (cr.isOverflow && nConsumedBytes == 0) {
+ cb = CharBuffer.allocate(cb.capacity + 1)
+ }
+ } while (cr.isOverflow && nConsumedBytes == 0 && cb.capacity <= bb.capacity)
+
+ // Once we leave the loop, we have either consumed bytes (any left over bytes are of no interest here), or we have a
+ // malformed/unmappable result with no bytes consumed. In the latter case we do a manual replacement and set the
+ // consumed byte count ourselves. We do not use the decoder's automatic replacement because, when handling one decoded
+ // character at a time, it makes it ambiguous which bytes were malformed, unmappable, or consumed.
+
+ // We should never have an underflow condition with no bytes consumed, as that would indicate the decoder needs more
+ // input than we've provided. Even if we only provide 1 byte of a 4 byte sequence, it will return a malformed[1] result.
+ Assert.invariant(!(cr.isUnderflow && nConsumedBytes == 0))
+
+ if ((cr.isMalformed || cr.isUnmappable) && nConsumedBytes == 0) {
+ //do manual replacement
+ remapped = dec.replacement()
+ // grab malformed/unmappable byte so we can keep decoding
+ nConsumedBytes = cr.length
+ nCols = charNColumns(remapped(0))
+ } else {
+ // An overflow, at this point, means that we got our one character, but there were more bytes available that could
+ // be decoded. We're not interested in those right now.
//
// An underflow means that we got our one character, but the bytes were exactly used up
// by constructing that one character.
//
// Either way, we got our one character
- // how many bytes did it consume?
- val nConsumedBytes = bb.position()
Assert.invariant(nConsumedBytes > 0)
- val char = cb.get(0)
- val nCols = charNColumns(char)
- val charInt = cb.get(0)
- val remapped = Misc.remapCodepointToVisibleGlyph(charInt).toChar
- (remapped, nConsumedBytes, nCols)
- } else
- Assert.invariantFailed("decode should only terminate with OVERFLOW or UNDERFLOW. Was: " + cr)
+ Assert.invariant(cb.hasArray)
+ var allChars = cb.array
+ remapped = if (allChars.length > 1) allChars.mkString else Misc.remapCodepointToVisibleGlyph(allChars(0)).toChar.toString
+ nCols = if (allChars.length > 1) {
+ try {
+ var uCodePoint = UCharacter.getCodePoint(allChars(0), allChars(1))
+ charNColumns(uCodePoint)
+ } catch {
+ case e: IllegalArgumentException => {
+ allChars.mkString.length
+ }
+ }
+ } else charNColumns(allChars(0))
+ }
+ (remapped, nConsumedBytes, nCols)
}
case None => {
// no encoding, so use the general one based on windows-1252 where
@@ -510,7 +544,7 @@ class DataDumper {
// FIXME: This will be really broken for EBCDIC-based encodings. Pass the encoding
// so that the glyph routine can be ascii/ebcdic sensitive.
val remapped = Misc.remapByteToVisibleGlyph(byteValue)
- (remapped.toChar, 1, 1)
+ (remapped.toChar.toString, 1, 1)
}
}
}
@@ -568,8 +602,8 @@ class DataDumper {
var i = startByteAddress0b
val sb = new StringBuilder
while (i <= endByteAddress0b) {
- val (c, _, _) = convertToChar(i - startByteAddress0b, endByteAddress0b, byteSource, decoder)
- sb += c
+ val (cR, _, _) = convertToCharRepr(i - startByteAddress0b, endByteAddress0b, byteSource, decoder)
+ sb += cR(0)
i += 1
}
val s = sb.mkString
diff --git a/daffodil-io/src/test/scala/org/apache/daffodil/io/TestDump.scala b/daffodil-io/src/test/scala/org/apache/daffodil/io/TestDump.scala
index 5e331b8..86f3404 100644
--- a/daffodil-io/src/test/scala/org/apache/daffodil/io/TestDump.scala
+++ b/daffodil-io/src/test/scala/org/apache/daffodil/io/TestDump.scala
@@ -197,6 +197,31 @@ class TestDump {
assertEquals(expected, "\n" + dumpString + "\n")
}
+ @Test def testDumpHexAndText4() {
+
+ val bytes =
+ """da8b f090 a487 f48b be8b be7a 1234 4567 f48b 8018 0156
+dada 0000 0101 0817 ece2 8017 ece2 dead beef cc7a 1234
+4567 f48b"""
+ .replaceAll("\\s+", "").grouped(2)
+ .map { Integer.parseInt(_, 16).toByte }.toArray
+ val lengthInBits = bytes.length * 8
+ val bs = new BS(bytes)
+ val dumpString = Dump.dump(Dump.MixedHexLTR(Some("utf-8")), 0, lengthInBits, bs,
+ includeHeadingLine = true).mkString("\n")
+ val u068b = Character.toChars(0x068b).mkString
+ val u10907 = Character.toChars(0x10907).mkString
+ val u10bf8b = Character.toChars(0x10bf8b).mkString
+ val u07ad = Character.toChars(0x07ad).mkString
+ val expected = s"""
+87654321 0011 2233 4455 6677 8899 aabb ccdd eeff 0~1~2~3~4~5~6~7~8~9~a~b~c~d~e~f~
+00000000: da8b f090 a487 f48b be8b be7a 1234 4567 ${u068b}~~~${u10907}~~~~~~~${u10bf8b}~~~~~~~�~z~␒~4~E~g~
+00000010: f48b 8018 0156 dada 0000 0101 0817 ece2 �~~~~~␘~␁~V~�~�~␀~␀~␁~␁~␈~␗~�~�~
+00000020: 8017 ece2 dead beef cc7a 1234 4567 f48b ~~␗~�~�~${u07ad}~~~�~�~�~z~␒~4~E~g~�~~~
+""".replace("\r\n", "\n")
+ assertEquals(expected, "\n" + dumpString + "\n")
+ }
+
@Test def testDump1() {
val bs = new BS((0 to 255).map { _.toByte }.toArray)