You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@avro.apache.org by "Kousuke Saruta (Jira)" <ji...@apache.org> on 2023/08/23 15:51:00 UTC
[jira] [Updated] (AVRO-3841) Align the specification of encoding NaN to the actual implementations

     [ https://issues.apache.org/jira/browse/AVRO-3841?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Kousuke Saruta updated AVRO-3841:
---------------------------------
    Issue Type: Improvement  (was: Bug)

> Align the specification of encoding NaN to the actual implementations
> ---------------------------------------------------------------------
>
>                 Key: AVRO-3841
>                 URL: https://issues.apache.org/jira/browse/AVRO-3841
>             Project: Apache Avro
>          Issue Type: Improvement
>          Components: spec
>    Affects Versions: 1.12.0
>            Reporter: Kousuke Saruta
>            Priority: Minor
>
> The specification describes how to encode float/double as follows.
> {code}
> a float is written as 4 bytes. The float is converted into a 32-bit integer using a method equivalent to Java’s floatToIntBits and then encoded in little-endian format.
> a double is written as 8 bytes. The double is converted into a 64-bit integer using a method equivalent to Java’s doubleToLongBits and then encoded in little-endian format.
> {code}
> But the actual implementation in Java uses floatToRawIntBits/doubleToRawLongBits rather than floatToIntBits/doubleToLongBits.
> They differ in how they encode NaN.
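> floatToIntBits/doubleToLongBits do not distinguish between NaN and -NaN (every NaN value is collapsed to a single canonical bit pattern), but floatToRawIntBits/doubleToRawLongBits do (the original NaN bit pattern is preserved).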
> I confirmed that all the implementations distinguish between NaN and -NaN.
> So, I think it's better to modify the specification to refer to floatToRawIntBits/doubleToRawLongBits, matching what the implementations actually do.
> Java
> {code}
>   public static int encodeFloat(float f, byte[] buf, int pos) {
>     final int bits = Float.floatToRawIntBits(f);
>     buf[pos + 3] = (byte) (bits >>> 24);
>     buf[pos + 2] = (byte) (bits >>> 16);
>     buf[pos + 1] = (byte) (bits >>> 8);
>     buf[pos] = (byte) (bits);
>     return 4;
>   }
>   public static int encodeDouble(double d, byte[] buf, int pos) {
>     final long bits = Double.doubleToRawLongBits(d);
>     int first = (int) (bits & 0xFFFFFFFF);
>     int second = (int) ((bits >>> 32) & 0xFFFFFFFF);
>     // the compiler seems to execute this order the best, likely due to
>     // register allocation -- the lifetime of constants is minimized.
>     buf[pos] = (byte) (first);
>     buf[pos + 4] = (byte) (second);
>     buf[pos + 5] = (byte) (second >>> 8);
>     buf[pos + 1] = (byte) (first >>> 8);
>     buf[pos + 2] = (byte) (first >>> 16);
>     buf[pos + 6] = (byte) (second >>> 16);
>     buf[pos + 7] = (byte) (second >>> 24);
>     buf[pos + 3] = (byte) (first >>> 24);
>     return 8;
>   }
> {code}
> Rust
> {code}
> Value::Float(x) => buffer.extend_from_slice(&x.to_le_bytes()),
> Value::Double(x) => buffer.extend_from_slice(&x.to_le_bytes()),
> {code}
> Python
> {code}
>     def write_float(self, datum: float) -> None:                                                                                                  
>         """                                                                                                                                       
>         A float is written as 4 bytes.                                                                                                            
>         The float is converted into a 32-bit integer using a method equivalent to                                                                 
>         Java's floatToIntBits and then encoded in little-endian format.                                                                           
>         """                                                                                                                                       
>         self.write(STRUCT_FLOAT.pack(datum)) 
>     def write_double(self, datum: float) -> None:                                                                                                 
>         """                                                                                                                                       
>         A double is written as 8 bytes.                                                                                                           
>         The double is converted into a 64-bit integer using a method equivalent to                                                                
>         Java's doubleToLongBits and then encoded in little-endian format.                                                                         
>         """                                                                                                                                       
>         self.write(STRUCT_DOUBLE.pack(datum))
> {code}
> C
> {code}
> static int write_float(avro_writer_t writer, const float f)
> {
> #if AVRO_PLATFORM_IS_BIG_ENDIAN
>         uint8_t buf[4];
> #endif
>         union {
>                 float f;
>                 int32_t i;
>         } v;
>         v.f = f;
> #if AVRO_PLATFORM_IS_BIG_ENDIAN
>         buf[0] = (uint8_t) (v.i >> 0);
>         buf[1] = (uint8_t) (v.i >> 8);
>         buf[2] = (uint8_t) (v.i >> 16);
>         buf[3] = (uint8_t) (v.i >> 24);
>         AVRO_WRITE(writer, buf, 4);
> #else
>         AVRO_WRITE(writer, (void *)&v.i, 4);
> #endif
>         return 0;
> }
> static int write_double(avro_writer_t writer, const double d)
> {
> #if AVRO_PLATFORM_IS_BIG_ENDIAN
>         uint8_t buf[8];
> #endif
>         union {
>                 double d;
>                 int64_t l;
>         } v;
>         v.d = d;
> #if AVRO_PLATFORM_IS_BIG_ENDIAN
>         buf[0] = (uint8_t) (v.l >> 0);
>         buf[1] = (uint8_t) (v.l >> 8);
>         buf[2] = (uint8_t) (v.l >> 16);
>         buf[3] = (uint8_t) (v.l >> 24);
>         buf[4] = (uint8_t) (v.l >> 32);
>         buf[5] = (uint8_t) (v.l >> 40);
>         buf[6] = (uint8_t) (v.l >> 48);
>         buf[7] = (uint8_t) (v.l >> 56);
>         AVRO_WRITE(writer, buf, 8);
> #else
>         AVRO_WRITE(writer, (void *)&v.l, 8);
> #endif
>         return 0;
> }
> {code}
> C++
> {code}
> void BinaryEncoder::encodeFloat(float f) {
>     const auto *p = reinterpret_cast<const uint8_t *>(&f);
>     out_.writeBytes(p, sizeof(float));
> }
> void BinaryEncoder::encodeDouble(double d) {
>     const auto *p = reinterpret_cast<const uint8_t *>(&d);
>     out_.writeBytes(p, sizeof(double));
> }
> {code}
> C#
> {code}
>         public void WriteFloat(float value)
>         {
>             byte[] buffer = BitConverter.GetBytes(value);
>             if (!BitConverter.IsLittleEndian) Array.Reverse(buffer);
>             writeBytes(buffer);
>         }
>         public void WriteDouble(double value)
>         {
>             long bits = BitConverter.DoubleToInt64Bits(value);
>             writeByte((byte)(bits & 0xFF));
>             writeByte((byte)((bits >> 8) & 0xFF));
>             writeByte((byte)((bits >> 16) & 0xFF));
>             writeByte((byte)((bits >> 24) & 0xFF));
>             writeByte((byte)((bits >> 32) & 0xFF));
>             writeByte((byte)((bits >> 40) & 0xFF));
>             writeByte((byte)((bits >> 48) & 0xFF));
>             writeByte((byte)((bits >> 56) & 0xFF));
>         }
> {code}
> Ruby
> {code}
>       def read_float
>         # A float is written as 4 bytes.
>         # The float is converted into a 32-bit integer using a method
>         # equivalent to Java's floatToRawIntBits and then encoded in
>         # little-endian format.
>         read_and_unpack(4, 'e')
>       end
>       def read_double
>         #  A double is written as 8 bytes.
>         # The double is converted into a 64-bit integer using a method
>         # equivalent to Java's doubleToRawLongBits and then encoded in
>         # little-endian format.
>         read_and_unpack(8, 'E')
>       end
> {code}
> Perl
> {code}
> sub encode_float {
>     my $class = shift;
>     my ($schema, $data, $cb) = @_;
>     my $enc = pack "f<", $data;
>     $cb->(\$enc);
> }
> sub encode_double {
>     my $class = shift;
>     my ($schema, $data, $cb) = @_;
>     my $enc = pack "d<", $data;
>     $cb->(\$enc);
> }
> {code}
> PHP
> {code}
>     public static function floatToIntBits($float)
>     {
>         return pack('g', (float) $float);
>     }
>     public static function doubleToLongBits($double)
>     {
>         return pack('e', (double) $double);
>     }
> {code}
> JavaScript
> {code}
> Tap.prototype.writeFloat = function (f) {
>   var buf = this.buf;
>   var pos = this.pos;
>   this.pos += 4;
>   if (this.pos > buf.length) {
>     return;
>   }
>   return this.buf.writeFloatLE(f, pos);
> };
> Tap.prototype.writeDouble = function (d) {
>   var buf = this.buf;
>   var pos = this.pos;
>   this.pos += 8;
>   if (this.pos > buf.length) {
>     return;
>   }
>   return this.buf.writeDoubleLE(d, pos);
> };
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)