You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by ma...@apache.org on 2010/03/04 03:15:53 UTC

svn commit: r918818 - in /hadoop/avro/trunk: CHANGES.txt lang/c/src/Makefile.am lang/c/src/avro.h lang/c/src/datum_size.c lang/c/src/encoding.h lang/c/src/encoding_binary.c lang/c/tests/test_avro_data.c lang/c/version.sh

Author: massie
Date: Thu Mar  4 02:15:53 2010
New Revision: 918818

URL: http://svn.apache.org/viewvc?rev=918818&view=rev
Log:
AVRO-445. avro_size_data() to pre-calculate the size of an avro_datum_t in serialized form. Contributed by Bruce Mitchener.

Added:
    hadoop/avro/trunk/lang/c/src/datum_size.c
Modified:
    hadoop/avro/trunk/CHANGES.txt
    hadoop/avro/trunk/lang/c/src/Makefile.am
    hadoop/avro/trunk/lang/c/src/avro.h
    hadoop/avro/trunk/lang/c/src/encoding.h
    hadoop/avro/trunk/lang/c/src/encoding_binary.c
    hadoop/avro/trunk/lang/c/tests/test_avro_data.c
    hadoop/avro/trunk/lang/c/version.sh

Modified: hadoop/avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/CHANGES.txt?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/CHANGES.txt (original)
+++ hadoop/avro/trunk/CHANGES.txt Thu Mar  4 02:15:53 2010
@@ -16,6 +16,9 @@
 
     AVRO-438. Clarify spec.  (Amichai Rothman via cutting)
 
+    AVRO-445. avro_size_data() to pre-calculate the size of an 
+    avro_datum_t in serialized form (Bruce Mitchener via massie)
+
   BUG FIXES
 
     AVRO-424. Fix the specification of the deflate codec.

Modified: hadoop/avro/trunk/lang/c/src/Makefile.am
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/src/Makefile.am?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/lang/c/src/Makefile.am (original)
+++ hadoop/avro/trunk/lang/c/src/Makefile.am Thu Mar  4 02:15:53 2010
@@ -7,7 +7,7 @@
 
 lib_LTLIBRARIES = libavro.la
 libavro_la_SOURCES = st.c st.h schema.c schema.h schema_equal.c \
-datum.c datum_equal.c datum_validate.c datum_read.c datum_skip.c datum_write.c datum.h \
+datum.c datum_equal.c datum_validate.c datum_read.c datum_skip.c datum_write.c datum_size.c datum.h \
 io.c dump.c dump.h encoding_binary.c \
 avro_private.h encoding.h datafile.c
 libavro_la_LIBADD = $(top_builddir)/jansson/src/.libs/libjansson.a

Modified: hadoop/avro/trunk/lang/c/src/avro.h
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/src/avro.h?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/lang/c/src/avro.h (original)
+++ hadoop/avro/trunk/lang/c/src/avro.h Thu Mar  4 02:15:53 2010
@@ -257,6 +257,8 @@
 int avro_skip_data(avro_reader_t reader, avro_schema_t writer_schema);
 int avro_write_data(avro_writer_t writer,
 		    avro_schema_t writer_schema, avro_datum_t datum);
+int64_t avro_size_data(avro_writer_t writer,
+		       avro_schema_t writer_schema, avro_datum_t datum);
 
 /* File object container */
 typedef struct avro_file_reader_t *avro_file_reader_t;

Added: hadoop/avro/trunk/lang/c/src/datum_size.c
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/src/datum_size.c?rev=918818&view=auto
==============================================================================
--- hadoop/avro/trunk/lang/c/src/datum_size.c (added)
+++ hadoop/avro/trunk/lang/c/src/datum_size.c Thu Mar  4 02:15:53 2010
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0 
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.  See the License for the specific language governing
+ * permissions and limitations under the License. 
+ */
+#include <errno.h>
+#include <assert.h>
+#include <string.h>
+#include "schema.h"
+#include "datum.h"
+#include "encoding.h"
+/* Return early from the caller on error; size_accum otherwise adds to size. */
+#define size_check(rval, call) do { (rval) = (call); if (rval) return (rval); } while (0)
+#define size_accum(rval, size, call) do { (rval) = (call); if ((rval) < 0) return (rval); (size) += (rval); } while (0)
+
+static int64_t size_datum(avro_writer_t writer, const avro_encoding_t * enc,
+			  avro_schema_t writers_schema, avro_datum_t datum);
+
+/*
+ * Size a record as the sum of its fields, in schema order or, with no schema,
+ * the datum's own order.  Failures are negative, never mistakable for a size.
+ */
+static int64_t
+size_record(avro_writer_t writer, const avro_encoding_t * enc,
+	    struct avro_record_schema_t *schema, avro_datum_t datum)
+{
+	int64_t rval, size = 0;	/* rval is 64-bit so sizes are not truncated */
+	long i;
+	avro_datum_t field_datum;
+
+	if (schema) {
+		for (i = 0; i < schema->fields->num_entries; i++) {
+			union { st_data_t data;
+				struct avro_record_field_t *field; } val;
+			/* a failed lookup would leave val uninitialized */
+			if (!st_lookup(schema->fields, i, &val.data)
+			    || avro_record_get(datum, val.field->name,
+					       &field_datum)) {
+				return -EINVAL;
+			}
+			size_accum(rval, size,
+				   size_datum(writer, enc, val.field->type,
+					      field_datum));
+		}
+	} else {
+		/* No schema.  Just size the record datum */
+		struct avro_record_datum_t *record =
+		    avro_datum_to_record(datum);
+		for (i = 0; i < record->field_order->num_entries; i++) {
+			union { st_data_t data; char *name; } val;
+			if (!st_lookup(record->field_order, i, &val.data)
+			    || avro_record_get(datum, val.name,
+					       &field_datum)) {
+				return -EINVAL;
+			}
+			size_accum(rval, size,
+				   size_datum(writer, enc, NULL, field_datum));
+		}
+	}
+	return size;
+}
+
+static int64_t
+size_enum(avro_writer_t writer, const avro_encoding_t * enc,
+	  struct avro_enum_schema_t *enump, struct avro_enum_datum_t *datum)
+{
+	return enc->size_long(writer, datum->value);	/* sized as a long; enump is unused */
+}
+
+struct size_map_args {
+	int rval;		/* error code recorded by the callback, 0 if none */
+	int64_t size;		/* running total of key and value sizes */
+	avro_writer_t writer;	/* handed through to the size functions */
+	const avro_encoding_t *enc;	/* encoding whose size_* hooks are called */
+	avro_schema_t values_schema;	/* schema of the map values, or NULL */
+};
+
+/* st_foreach() callback: add one map entry's key and value sizes to
+ * args->size; on error store the code in args->rval and return ST_STOP.
+ * rval is int64_t because the size hooks return 64-bit byte counts. */
+static int
+size_map_foreach(char *key, avro_datum_t datum, struct size_map_args *args)
+{
+	int64_t rval = args->enc->size_string(args->writer, key);
+	if (rval >= 0) {
+		args->size += rval;
+		rval = size_datum(args->writer, args->enc,
+				  args->values_schema, datum);
+	}
+	if (rval < 0) {
+		args->rval = (int)rval;
+		return ST_STOP;
+	}
+	args->size += rval;
+	return ST_CONTINUE;
+}
+
+/* Size a map: entry count and all key/value pairs (when non-empty) plus the
+ * terminating zero count.  Returns bytes, or the negative code recorded by
+ * size_map_foreach() if an entry could not be sized. */
+static int64_t
+size_map(avro_writer_t writer, const avro_encoding_t * enc,
+	 struct avro_map_schema_t *writers_schema,
+	 struct avro_map_datum_t *datum)
+{
+	int64_t rval, size = 0;	/* rval is 64-bit so sizes are not truncated */
+	struct size_map_args args = { 0, 0, writer, enc,
+		writers_schema ? writers_schema->values : NULL };
+	if (datum->map->num_entries) {
+		size_accum(rval, size,
+			   enc->size_long(writer, datum->map->num_entries));
+		st_foreach(datum->map, size_map_foreach, (st_data_t) & args);
+		if (args.rval) {
+			return args.rval;
+		}
+		size += args.size;
+	}
+	size_accum(rval, size, enc->size_long(writer, 0));
+	return size;
+}
+
+/* Size an array: element count and every element (when non-empty) plus the
+ * terminating zero count.  Returns bytes or a negative errno value. */
+static int64_t
+size_array(avro_writer_t writer, const avro_encoding_t * enc,
+	   struct avro_array_schema_t *schema, struct avro_array_datum_t *array)
+{
+	int64_t rval, size = 0;	/* rval is 64-bit so sizes are not truncated */
+	long i;
+
+	if (array->els->num_entries) {
+		size_accum(rval, size,
+			   enc->size_long(writer, array->els->num_entries));
+	}
+	for (i = 0; i < array->els->num_entries; i++) {
+		union { st_data_t data; avro_datum_t datum; } val;
+		/* a failed lookup would leave val uninitialized */
+		if (!st_lookup(array->els, i, &val.data)) {
+			return -EINVAL;
+		}
+		size_accum(rval, size,
+			   size_datum(writer, enc,
+				      schema ? schema->items : NULL,
+				      val.datum));
+	}
+	size_accum(rval, size, enc->size_long(writer, 0));
+	return size;
+}
+
+/*
+ * Size a union: the branch discriminant (as a long) followed by the value,
+ * sized against the selected branch schema when a schema is given.  Returns
+ * bytes, or -EINVAL when the discriminant names no branch of the schema.
+ */
+static int64_t
+size_union(avro_writer_t writer, const avro_encoding_t * enc,
+	   struct avro_union_schema_t *schema,
+	   struct avro_union_datum_t *unionp)
+{
+	int64_t rval, size = 0;	/* rval is 64-bit so sizes are not truncated */
+	avro_schema_t write_schema = NULL;
+
+	size_accum(rval, size, enc->size_long(writer, unionp->discriminant));
+	if (schema) {
+		union { st_data_t data; avro_schema_t schema; } val;
+		if (!st_lookup
+		    (schema->branches, unionp->discriminant, &val.data)) {
+			return -EINVAL;
+		}
+		write_schema = val.schema;
+	}
+	size_accum(rval, size,
+		   size_datum(writer, enc, write_schema, unionp->value));
+	return size;
+}
+
+/* Return the serialized size of `datum` (negative on error), following a
+ * link schema to its target and then dispatching on the datum's own type. */
+static int64_t size_datum(avro_writer_t writer, const avro_encoding_t * enc,
+			  avro_schema_t writers_schema, avro_datum_t datum)
+{
+	if (is_avro_schema(writers_schema) && is_avro_link(writers_schema)) {
+		return size_datum(writer, enc,
+				  (avro_schema_to_link(writers_schema))->to,
+				  datum);
+	}
+
+	switch (avro_typeof(datum)) {
+	case AVRO_NULL:
+		return enc->size_null(writer);
+
+	case AVRO_BOOLEAN:
+		return enc->size_boolean(writer,
+					 avro_datum_to_boolean(datum)->i);
+
+	case AVRO_STRING:
+		return enc->size_string(writer, avro_datum_to_string(datum)->s);
+
+	case AVRO_BYTES:
+		return enc->size_bytes(writer,
+				       avro_datum_to_bytes(datum)->bytes,
+				       avro_datum_to_bytes(datum)->size);
+
+	case AVRO_INT32:
+	case AVRO_INT64:{
+			int64_t val = avro_typeof(datum) == AVRO_INT32 ?
+			    avro_datum_to_int32(datum)->i32 :
+			    avro_datum_to_int64(datum)->i64;
+			if (is_avro_schema(writers_schema)) {
+				if (is_avro_float(writers_schema)) {	/* handle promotion */
+					return enc->size_float(writer,
+							       (float)val);
+				} else if (is_avro_double(writers_schema)) {
+					return enc->size_double(writer,
+								(double)val);
+				}
+			}
+			return enc->size_long(writer, val);	/* ints are sized as longs too */
+		}
+
+	case AVRO_FLOAT:{
+			float val = avro_datum_to_float(datum)->f;
+			if (is_avro_schema(writers_schema)
+			    && is_avro_double(writers_schema)) {	/* handle promotion */
+				return enc->size_double(writer, (double)val);
+			}
+			return enc->size_float(writer, val);
+		}
+
+	case AVRO_DOUBLE:
+		return enc->size_double(writer, avro_datum_to_double(datum)->d);
+
+	case AVRO_RECORD:
+		return size_record(writer, enc,
+				   avro_schema_to_record(writers_schema),
+				   datum);
+
+	case AVRO_ENUM:
+		return size_enum(writer, enc,
+				 avro_schema_to_enum(writers_schema),
+				 avro_datum_to_enum(datum));
+
+	case AVRO_FIXED:
+		return avro_datum_to_fixed(datum)->size;	/* stored size only, no length prefix added */
+
+	case AVRO_MAP:
+		return size_map(writer, enc,
+				avro_schema_to_map(writers_schema),
+				avro_datum_to_map(datum));
+
+	case AVRO_ARRAY:
+		return size_array(writer, enc,
+				  avro_schema_to_array(writers_schema),
+				  avro_datum_to_array(datum));
+
+	case AVRO_UNION:
+		return size_union(writer, enc,
+				  avro_schema_to_union(writers_schema),
+				  avro_datum_to_union(datum));
+
+	case AVRO_LINK:
+		break;		/* falls through to the "return 0" below */
+	}
+
+	return 0;		/* link or unlisted datum type: reported as zero bytes */
+}
+
+/* Public entry point: size `datum` in the binary encoding.  Returns -EINVAL
+ * for a NULL writer, a non-datum, or a datum failing the schema (if given). */
+int64_t avro_size_data(avro_writer_t writer, avro_schema_t writers_schema,
+		       avro_datum_t datum)
+{
+	if (writer && is_avro_datum(datum)
+	    && (!is_avro_schema(writers_schema)
+		|| avro_schema_datum_validate(writers_schema, datum))) {
+		return size_datum(writer, &avro_binary_encoding,
+				  writers_schema, datum);
+	}
+	return -EINVAL;
+}

Modified: hadoop/avro/trunk/lang/c/src/encoding.h
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/src/encoding.h?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/lang/c/src/encoding.h (original)
+++ hadoop/avro/trunk/lang/c/src/encoding.h Thu Mar  4 02:15:53 2010
@@ -29,6 +29,7 @@
 	int (*read_string) (avro_reader_t reader, char **s);
 	int (*skip_string) (avro_reader_t reader);
 	int (*write_string) (avro_writer_t writer, const char *s);
+	 int64_t(*size_string) (avro_writer_t writer, const char *s);
 	/*
 	 * bytes 
 	 */
@@ -36,42 +37,50 @@
 	int (*skip_bytes) (avro_reader_t reader);
 	int (*write_bytes) (avro_writer_t writer,
 			    const char *bytes, const int64_t len);
+	 int64_t(*size_bytes) (avro_writer_t writer,
+			       const char *bytes, const int64_t len);
 	/*
 	 * int 
 	 */
 	int (*read_int) (avro_reader_t reader, int32_t * i);
 	int (*skip_int) (avro_reader_t reader);
 	int (*write_int) (avro_writer_t writer, const int32_t i);
+	 int64_t(*size_int) (avro_writer_t writer, const int32_t i);
 	/*
 	 * long 
 	 */
 	int (*read_long) (avro_reader_t reader, int64_t * l);
 	int (*skip_long) (avro_reader_t reader);
 	int (*write_long) (avro_writer_t writer, const int64_t l);
+	 int64_t(*size_long) (avro_writer_t writer, const int64_t l);
 	/*
 	 * float 
 	 */
 	int (*read_float) (avro_reader_t reader, float *f);
 	int (*skip_float) (avro_reader_t reader);
 	int (*write_float) (avro_writer_t writer, const float f);
+	 int64_t(*size_float) (avro_writer_t writer, const float f);
 	/*
 	 * double 
 	 */
 	int (*read_double) (avro_reader_t reader, double *d);
 	int (*skip_double) (avro_reader_t reader);
 	int (*write_double) (avro_writer_t writer, const double d);
+	 int64_t(*size_double) (avro_writer_t writer, const double d);
 	/*
 	 * boolean 
 	 */
 	int (*read_boolean) (avro_reader_t reader, int8_t * b);
 	int (*skip_boolean) (avro_reader_t reader);
 	int (*write_boolean) (avro_writer_t writer, const int8_t b);
+	 int64_t(*size_boolean) (avro_writer_t writer, const int8_t b);
 	/*
 	 * null 
 	 */
 	int (*read_null) (avro_reader_t reader);
 	int (*skip_null) (avro_reader_t reader);
 	int (*write_null) (avro_writer_t writer);
+	 int64_t(*size_null) (avro_writer_t writer);
 };
 typedef struct avro_encoding_t avro_encoding_t;
 

Modified: hadoop/avro/trunk/lang/c/src/encoding_binary.c
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/src/encoding_binary.c?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/lang/c/src/encoding_binary.c (original)
+++ hadoop/avro/trunk/lang/c/src/encoding_binary.c Thu Mar  4 02:15:53 2010
@@ -72,6 +72,18 @@
 	return 0;
 }
 
+static int64_t size_long(avro_writer_t writer, int64_t l)
+{
+	/* zig-zag in unsigned math: "l << 1" is undefined for negative l */
+	uint64_t n = ((uint64_t) l << 1) ^ (l < 0 ? ~(uint64_t) 0 : 0);
+	int64_t len = 1;	/* every value takes at least one byte */
+	while (n & ~(uint64_t) 0x7F) {	/* 7 payload bits per byte */
+		len++;
+		n >>= 7;
+	}
+	return len;
+}
+
 static int read_int(avro_reader_t reader, int32_t * i)
 {
 	int64_t l;
@@ -97,6 +109,12 @@
 	return write_long(writer, l);
 }
 
+static int64_t size_int(avro_writer_t writer, const int32_t i)
+{
+	int64_t l = i;		/* widen so ints share the long sizing */
+	return size_long(writer, l);
+}
+
 static int read_bytes(avro_reader_t reader, char **bytes, int64_t * len)
 {
 	int rval = read_long(reader, len);
@@ -138,6 +156,12 @@
 	return 0;
 }
 
+static int64_t
+size_bytes(avro_writer_t writer, const char *bytes, const int64_t len)
+{
+	return size_long(writer, len) + len;	/* size of the length value + payload bytes */
+}
+
 static int read_string(avro_reader_t reader, char **s)
 {
 	int64_t len;
@@ -155,6 +179,12 @@
 	return write_bytes(writer, s, len);
 }
 
+static int64_t size_string(avro_writer_t writer, const char *s)
+{
+	int64_t len = strlen(s);	/* terminating NUL is not counted */
+	return size_bytes(writer, s, len);	/* strings are sized like bytes */
+}
+
 static int read_float(avro_reader_t reader, float *f)
 {
 #if WORDS_BIGENDIAN
@@ -205,6 +235,11 @@
 	return 0;
 }
 
+static int64_t size_float(avro_writer_t writer, const float f)
+{
+	return 4;		/* fixed size, independent of the value */
+}
+
 static int read_double(avro_reader_t reader, double *d)
 {
 #if WORDS_BIGENDIAN
@@ -264,6 +299,11 @@
 	return 0;
 }
 
+static int64_t size_double(avro_writer_t writer, const double d)
+{
+	return 8;		/* fixed size, independent of the value */
+}
+
 static int read_boolean(avro_reader_t reader, int8_t * b)
 {
 	AVRO_READ(reader, b, 1);
@@ -282,6 +322,11 @@
 	return 0;
 }
 
+static int64_t size_boolean(avro_writer_t writer, const int8_t b)
+{
+	return 1;		/* always one byte, whatever the value */
+}
+
 static int read_skip_null(avro_reader_t reader)
 {
 	/*
@@ -298,6 +343,11 @@
 	return 0;
 }
 
+static int64_t size_null(avro_writer_t writer)
+{
+	return 0;		/* a null contributes no bytes */
+}
+
 const avro_encoding_t avro_binary_encoding = {
 	.description = "BINARY FORMAT",
 	/*
@@ -306,46 +356,54 @@
 	.read_string = read_string,
 	.skip_string = skip_string,
 	.write_string = write_string,
+	.size_string = size_string,
 	/*
 	 * bytes 
 	 */
 	.read_bytes = read_bytes,
 	.skip_bytes = skip_bytes,
 	.write_bytes = write_bytes,
+	.size_bytes = size_bytes,
 	/*
 	 * int 
 	 */
 	.read_int = read_int,
 	.skip_int = skip_int,
 	.write_int = write_int,
+	.size_int = size_int,
 	/*
 	 * long 
 	 */
 	.read_long = read_long,
 	.skip_long = skip_long,
 	.write_long = write_long,
+	.size_long = size_long,
 	/*
 	 * float 
 	 */
 	.read_float = read_float,
 	.skip_float = skip_float,
 	.write_float = write_float,
+	.size_float = size_float,
 	/*
 	 * double 
 	 */
 	.read_double = read_double,
 	.skip_double = skip_double,
 	.write_double = write_double,
+	.size_double = size_double,
 	/*
 	 * boolean 
 	 */
 	.read_boolean = read_boolean,
 	.skip_boolean = skip_boolean,
 	.write_boolean = write_boolean,
+	.size_boolean = size_boolean,
 	/*
 	 * null 
 	 */
 	.read_null = read_skip_null,
 	.skip_null = read_skip_null,
-	.write_null = write_null
+	.write_null = write_null,
+	.size_null = size_null
 };

Modified: hadoop/avro/trunk/lang/c/tests/test_avro_data.c
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/tests/test_avro_data.c?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/lang/c/tests/test_avro_data.c (original)
+++ hadoop/avro/trunk/lang/c/tests/test_avro_data.c Thu Mar  4 02:15:53 2010
@@ -67,6 +67,15 @@
 				type, validate);
 			exit(EXIT_FAILURE);
 		}
+		int64_t size =
+		    avro_size_data(writer, validate ? writers_schema : NULL,
+				   datum);
+		if (size != avro_writer_tell(writer)) {
+			fprintf(stderr,
+				"Unable to calculate size %s validate=%d (%lld != %lld)\n",
+				type, validate, size, avro_writer_tell(writer));
+			exit(EXIT_FAILURE);
+		}
 		if (avro_read_data
 		    (reader, writers_schema, readers_schema, &datum_out)) {
 			fprintf(stderr, "Unable to read %s validate=%d\n", type,

Modified: hadoop/avro/trunk/lang/c/version.sh
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/lang/c/version.sh?rev=918818&r1=918817&r2=918818&view=diff
==============================================================================
--- hadoop/avro/trunk/lang/c/version.sh (original)
+++ hadoop/avro/trunk/lang/c/version.sh Thu Mar  4 02:15:53 2010
@@ -18,9 +18,9 @@
 #         libavro_binary_age = 0
 #         libavro_interface_age = 0
 #
-libavro_micro_version=18
+libavro_micro_version=19
 libavro_interface_age=0
-libavro_binary_age=0
+libavro_binary_age=1
 
 # IGNORE EVERYTHING ELSE FROM HERE DOWN.........
 if test $# != 1; then