You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@avro.apache.org by "Kousuke Saruta (Jira)" <ji...@apache.org> on 2023/08/23 15:51:00 UTC
[jira] [Created] (AVRO-3841) Align the specification of encoding NaN to the actual implementations
Kousuke Saruta created AVRO-3841:
------------------------------------
Summary: Align the specification of encoding NaN to the actual implementations
Key: AVRO-3841
URL: https://issues.apache.org/jira/browse/AVRO-3841
Project: Apache Avro
Issue Type: Bug
Components: spec
Affects Versions: 1.12.0
Reporter: Kousuke Saruta
The specification describes how to encode float/double as follows.
{code}
a float is written as 4 bytes. The float is converted into a 32-bit integer using a method equivalent to Java’s floatToIntBits and then encoded in little-endian format.
a double is written as 8 bytes. The double is converted into a 64-bit integer using a method equivalent to Java’s doubleToLongBits and then encoded in little-endian format.
{code}
But the actual implementation in Java uses floatToRawIntBits/doubleToRawLongBits rather than floatToIntBits/doubleToLongBits.
They differ in how they encode NaN.
floatToIntBits/doubleToLongBits collapse every NaN into a single canonical bit pattern, so they don't distinguish between NaN and -NaN, but floatToRawIntBits/doubleToRawLongBits do.
I confirmed that all the implementations distinguish between NaN and -NaN.
So, I think it's better to modify the specification.
Java
{code}
/**
 * Stores the raw bit pattern of {@code f} in {@code buf} as four bytes,
 * least significant byte at {@code pos} (little-endian).
 *
 * @param f   the value to encode; converted with
 *            {@link Float#floatToRawIntBits(float)}
 * @param buf destination array; indices {@code pos} to {@code pos + 3} are
 *            written
 * @param pos index of the first (least significant) byte
 * @return the number of bytes written, always 4
 */
public static int encodeFloat(float f, byte[] buf, int pos) {
  final int raw = Float.floatToRawIntBits(f);
  // Walk from the highest index down so the writes happen in the same order
  // as before (pos + 3 first, pos last).
  for (int offset = 3; offset >= 0; offset--) {
    buf[pos + offset] = (byte) (raw >>> (8 * offset));
  }
  return 4;
}
/**
 * Stores the raw bit pattern of {@code d} in {@code buf} as eight bytes,
 * least significant byte at {@code pos} (little-endian).
 *
 * @param d   the value to encode; converted with
 *            {@link Double#doubleToRawLongBits(double)}
 * @param buf destination array; indices {@code pos} to {@code pos + 7} are
 *            written
 * @param pos index of the first (least significant) byte
 * @return the number of bytes written, always 8
 */
public static int encodeDouble(double d, byte[] buf, int pos) {
final long bits = Double.doubleToRawLongBits(d);
// Low 32 bits. The int literal 0xFFFFFFFF widens to -1L, so the mask keeps
// every bit; the (int) cast is what actually truncates.
int first = (int) (bits & 0xFFFFFFFF);
// High 32 bits.
int second = (int) ((bits >>> 32) & 0xFFFFFFFF);
// the compiler seems to execute this order the best, likely due to
// register allocation -- the lifetime of constants is minimized.
// Bytes pos..pos+3 come from `first`, bytes pos+4..pos+7 from `second`;
// the interleaved statement order below is deliberate (see above).
buf[pos] = (byte) (first);
buf[pos + 4] = (byte) (second);
buf[pos + 5] = (byte) (second >>> 8);
buf[pos + 1] = (byte) (first >>> 8);
buf[pos + 2] = (byte) (first >>> 16);
buf[pos + 6] = (byte) (second >>> 16);
buf[pos + 7] = (byte) (second >>> 24);
buf[pos + 3] = (byte) (first >>> 24);
return 8;
}
{code}
Rust
{code}
Value::Float(x) => buffer.extend_from_slice(&x.to_le_bytes()),
Value::Double(x) => buffer.extend_from_slice(&x.to_le_bytes()),
{code}
Python
{code}
def write_float(self, datum: float) -> None:
    """
    Pack ``datum`` with ``STRUCT_FLOAT`` and pass the packed bytes to
    ``self.write``.

    The byte layout is whatever ``STRUCT_FLOAT`` produces. It is defined
    outside this method -- presumably a 4-byte little-endian
    single-precision packer; confirm against its definition.
    """
    packed = STRUCT_FLOAT.pack(datum)
    self.write(packed)
def write_double(self, datum: float) -> None:
    """
    Pack ``datum`` with ``STRUCT_DOUBLE`` and pass the packed bytes to
    ``self.write``.

    The byte layout is whatever ``STRUCT_DOUBLE`` produces. It is defined
    outside this method -- presumably an 8-byte little-endian
    double-precision packer; confirm against its definition.
    """
    packed = STRUCT_DOUBLE.pack(datum)
    self.write(packed)
{code}
C
{code}
/*
 * Write a float to `writer` as 4 bytes, least significant byte first.
 *
 * The float's storage is reinterpreted as a 32-bit integer through a union,
 * so the bit pattern is passed on unchanged. On big-endian builds the bytes
 * are split out of the integer one at a time; otherwise the integer's own
 * storage is handed to AVRO_WRITE directly.
 *
 * Returns 0 when control reaches the end of the function.
 * NOTE(review): AVRO_WRITE is a macro defined elsewhere; whether it can
 * return early on failure is not visible here -- confirm.
 */
static int write_float(avro_writer_t writer, const float f)
{
	union {
		float as_float;
		int32_t as_int;
	} bits;
#if AVRO_PLATFORM_IS_BIG_ENDIAN
	uint8_t buf[4];
	int n;
#endif

	bits.as_float = f;
#if AVRO_PLATFORM_IS_BIG_ENDIAN
	/* buf[n] receives bits 8n..8n+7, i.e. little-endian order. */
	for (n = 0; n < 4; n++) {
		buf[n] = (uint8_t) (bits.as_int >> (8 * n));
	}
	AVRO_WRITE(writer, buf, 4);
#else
	AVRO_WRITE(writer, (void *)&bits.as_int, 4);
#endif
	return 0;
}
/*
 * Write a double to `writer` as 8 bytes, least significant byte first.
 *
 * The double's storage is reinterpreted as a 64-bit integer through a union,
 * so the bit pattern is passed on unchanged. On big-endian builds the bytes
 * are split out of the integer one at a time; otherwise the integer's own
 * storage is handed to AVRO_WRITE directly.
 *
 * Returns 0 when control reaches the end of the function.
 * NOTE(review): AVRO_WRITE is a macro defined elsewhere; whether it can
 * return early on failure is not visible here -- confirm.
 */
static int write_double(avro_writer_t writer, const double d)
{
	union {
		double as_double;
		int64_t as_long;
	} bits;
#if AVRO_PLATFORM_IS_BIG_ENDIAN
	uint8_t buf[8];
	int n;
#endif

	bits.as_double = d;
#if AVRO_PLATFORM_IS_BIG_ENDIAN
	/* buf[n] receives bits 8n..8n+7, i.e. little-endian order. */
	for (n = 0; n < 8; n++) {
		buf[n] = (uint8_t) (bits.as_long >> (8 * n));
	}
	AVRO_WRITE(writer, buf, 8);
#else
	AVRO_WRITE(writer, (void *)&bits.as_long, 8);
#endif
	return 0;
}
{code}
C++
{code}
/// Writes the bytes of `f` to the output, least significant byte first.
///
/// The object representation is copied byte for byte, so the bit pattern
/// (including the sign and payload of a NaN) is passed on unchanged. On a
/// little-endian host the in-memory bytes are already in wire order; on a
/// big-endian host they are reversed first, so the output does not depend
/// on the machine that produced it.
void BinaryEncoder::encodeFloat(float f) {
    const auto *p = reinterpret_cast<const uint8_t *>(&f);
    // Host byte-order probe: the first byte of the value 1 is 1 only on a
    // little-endian machine.
    const uint16_t probe = 1;
    if (*reinterpret_cast<const uint8_t *>(&probe) == 1) {
        out_.writeBytes(p, sizeof(float));
        return;
    }
    uint8_t reversed[sizeof(float)];
    for (unsigned i = 0; i < sizeof(float); ++i) {
        reversed[i] = p[sizeof(float) - 1 - i];
    }
    out_.writeBytes(reversed, sizeof(float));
}
/// Writes the bytes of `d` to the output, least significant byte first.
///
/// The object representation is copied byte for byte, so the bit pattern
/// (including the sign and payload of a NaN) is passed on unchanged. On a
/// little-endian host the in-memory bytes are already in wire order; on a
/// big-endian host they are reversed first, so the output does not depend
/// on the machine that produced it.
void BinaryEncoder::encodeDouble(double d) {
    const auto *p = reinterpret_cast<const uint8_t *>(&d);
    // Host byte-order probe: the first byte of the value 1 is 1 only on a
    // little-endian machine.
    const uint16_t probe = 1;
    if (*reinterpret_cast<const uint8_t *>(&probe) == 1) {
        out_.writeBytes(p, sizeof(double));
        return;
    }
    uint8_t reversed[sizeof(double)];
    for (unsigned i = 0; i < sizeof(double); ++i) {
        reversed[i] = p[sizeof(double) - 1 - i];
    }
    out_.writeBytes(reversed, sizeof(double));
}
{code}
C#
{code}
/// <summary>
/// Writes a float as the bytes returned by BitConverter.GetBytes, reversed
/// first when the host is not little-endian so the output is always
/// least-significant-byte first.
/// </summary>
/// <param name="value">Value to write.</param>
public void WriteFloat(float value)
{
    byte[] raw = BitConverter.GetBytes(value);
    if (BitConverter.IsLittleEndian == false)
    {
        // GetBytes returns host order; flip it into little-endian order.
        Array.Reverse(raw);
    }
    writeBytes(raw);
}
/// <summary>
/// Writes a double as eight bytes, least significant byte first. The value
/// is converted with BitConverter.DoubleToInt64Bits and emitted one byte at
/// a time.
/// </summary>
/// <param name="value">Value to write.</param>
public void WriteDouble(double value)
{
    long raw = BitConverter.DoubleToInt64Bits(value);
    // Shifts of 0, 8, ..., 56 select bytes from least to most significant.
    for (int shift = 0; shift < 64; shift += 8)
    {
        writeByte((byte)((raw >> shift) & 0xFF));
    }
}
{code}
Ruby
{code}
def read_float
  # Reads a float: asks read_and_unpack for 4 bytes with the 'e' directive
  # (little-endian single-precision in Ruby's pack/unpack notation).
  # read_and_unpack is defined elsewhere -- presumably it reads that many
  # bytes and unpacks them; confirm against its definition.
  byte_count = 4
  read_and_unpack(byte_count, 'e')
end
def read_double
  # Reads a double: asks read_and_unpack for 8 bytes with the 'E' directive
  # (little-endian double-precision in Ruby's pack/unpack notation).
  # read_and_unpack is defined elsewhere -- presumably it reads that many
  # bytes and unpacks them; confirm against its definition.
  byte_count = 8
  read_and_unpack(byte_count, 'E')
end
{code}
Perl
{code}
# Packs $data with the "f<" template (native single-precision float, forced
# little-endian) and hands a reference to the packed string to the callback.
#
# Arguments: ($class, $schema, $data, $cb). $class and $schema are not used
# here. Returns whatever the callback returns.
sub encode_float {
    my ($class, $schema, $data, $cb) = @_;
    my $packed = pack("f<", $data);
    return $cb->(\$packed);
}
# Packs $data with the "d<" template (native double-precision float, forced
# little-endian) and hands a reference to the packed string to the callback.
#
# Arguments: ($class, $schema, $data, $cb). $class and $schema are not used
# here. Returns whatever the callback returns.
sub encode_double {
    my ($class, $schema, $data, $cb) = @_;
    my $packed = pack("d<", $data);
    return $cb->(\$packed);
}
{code}
PHP
{code}
/**
 * Packs a value as a little-endian single-precision float.
 *
 * @param mixed $float value to encode; it is cast to float first
 * @return string binary string produced by pack() with the 'g' format
 */
public static function floatToIntBits($float)
{
    $asFloat = (float) $float;

    return pack('g', $asFloat);
}
/**
 * Packs a value as a little-endian double-precision float.
 *
 * Uses the canonical (float) cast: (double) is only an alias for it, and
 * the non-canonical spelling is deprecated as of PHP 8.5. The resulting
 * value is the same.
 *
 * @param mixed $double value to encode; it is cast to float first
 * @return string binary string produced by pack() with the 'e' format
 */
public static function doubleToLongBits($double)
{
    return pack('e', (float) $double);
}
{code}
JavaScript
{code}
/**
 * Writes `f` at the tap's current position with `writeFloatLE` and advances
 * the position by 4.
 *
 * The position is advanced even when the value does not fit; in that case
 * nothing is written and the function returns `undefined`.
 *
 * @param {number} f Value to write.
 * @returns {*} The result of `writeFloatLE`, or `undefined` when the new
 *   position is past the end of the buffer.
 */
Tap.prototype.writeFloat = function (f) {
  var target = this.buf;
  var start = this.pos;
  this.pos += 4;
  if (this.pos > target.length) {
    return undefined;
  }
  return target.writeFloatLE(f, start);
};
/**
 * Writes `d` at the tap's current position with `writeDoubleLE` and
 * advances the position by 8.
 *
 * The position is advanced even when the value does not fit; in that case
 * nothing is written and the function returns `undefined`.
 *
 * @param {number} d Value to write.
 * @returns {*} The result of `writeDoubleLE`, or `undefined` when the new
 *   position is past the end of the buffer.
 */
Tap.prototype.writeDouble = function (d) {
  var target = this.buf;
  var start = this.pos;
  this.pos += 8;
  if (this.pos > target.length) {
    return undefined;
  }
  return target.writeDoubleLE(d, start);
};
{code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)