You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by th...@apache.org on 2018/11/28 03:32:50 UTC
[avro] branch master updated: AVRO-2274 Improve resolving
performance when schemas don't change. (#393)
This is an automated email from the ASF dual-hosted git repository.
thiru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/master by this push:
new 6eb2560 AVRO-2274 Improve resolving performance when schemas don't change. (#393)
6eb2560 is described below
commit 6eb25603b96169bf8d77269176218c63c181e9f4
Author: Raymie Stata <rs...@yahoo.com>
AuthorDate: Tue Nov 27 19:32:45 2018 -0800
AVRO-2274 Improve resolving performance when schemas don't change. (#393)
* AVRO-2274 Improve resolving performance when schemas don't change.
* AVRO-2274 Break out of field-no-reorder loop as early as possible.
---
.../java/org/apache/avro/io/ResolvingDecoder.java | 28 ++++++-
.../avro/io/parsing/ResolvingGrammarGenerator.java | 72 ++++++++++++++++--
.../java/org/apache/avro/io/parsing/Symbol.java | 38 ++++++++++
.../org/apache/avro/io/parsing/SymbolTest.java | 4 +-
.../specific/templates/java/classic/record.vm | 31 ++++++--
.../avro/examples/baseball/Player.java | 88 ++++++++++++++--------
.../tools/src/test/compiler/output/Player.java | 88 ++++++++++++++--------
7 files changed, 266 insertions(+), 83 deletions(-)
diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/ResolvingDecoder.java b/lang/java/avro/src/main/java/org/apache/avro/io/ResolvingDecoder.java
index 8f1f6a9..45ff922 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/io/ResolvingDecoder.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/io/ResolvingDecoder.java
@@ -130,6 +130,19 @@ public class ResolvingDecoder extends ValidatingDecoder {
}
/**
+ * Same as {@link #readFieldOrder} except that it returns
+ * <tt>null</tt> if there was no reordering of fields, i.e., if the
+ * correct thing for the reader to do is to read all of its fields
+ * in the order specified by its own schema (useful for
+ * optimizations).
+ */
+ public final Schema.Field[] readFieldOrderIfDiff() throws IOException {
+ Symbol.FieldOrderAction top
+ = (Symbol.FieldOrderAction) parser.advance(Symbol.FIELD_ACTION);
+ return (top.noReorder ? null : top.fields);
+ }
+
+ /**
* Consume any more data that has been written by the writer but not
* needed by the reader so that the the underlying decoder is in proper
* shape for the next record. This situation happens when, for example,
@@ -252,6 +265,7 @@ public class ResolvingDecoder extends ValidatingDecoder {
parser.advance(Symbol.ENUM);
Symbol.EnumAdjustAction top = (Symbol.EnumAdjustAction) parser.popSymbol();
int n = in.readEnum();
+ if (top.noAdjustments) return n;
Object o = top.adjustments[n];
if (o instanceof Integer) {
return ((Integer) o).intValue();
@@ -263,9 +277,17 @@ public class ResolvingDecoder extends ValidatingDecoder {
@Override
public int readIndex() throws IOException {
parser.advance(Symbol.UNION);
- Symbol.UnionAdjustAction top = (Symbol.UnionAdjustAction) parser.popSymbol();
- parser.pushSymbol(top.symToParse);
- return top.rindex;
+ Symbol top = parser.popSymbol();
+ int result;
+ if (top instanceof Symbol.UnionAdjustAction) {
+ result = ((Symbol.UnionAdjustAction) top).rindex;
+ top = ((Symbol.UnionAdjustAction) top).symToParse;
+ } else {
+ result = in.readIndex();
+ top = ((Symbol.Alternative) top).getSymbol(result);
+ }
+ parser.pushSymbol(top);
+ return result;
}
@Override
diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/parsing/ResolvingGrammarGenerator.java b/lang/java/avro/src/main/java/org/apache/avro/io/parsing/ResolvingGrammarGenerator.java
index 7197882..61073dc 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/io/parsing/ResolvingGrammarGenerator.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/io/parsing/ResolvingGrammarGenerator.java
@@ -76,8 +76,8 @@ public class ResolvingGrammarGenerator extends ValidatingGrammarGenerator {
* @return The start symbol for the resolving grammar
* @throws IOException
*/
- public Symbol generate(Schema writer, Schema reader,
- Map<LitS, Symbol> seen) throws IOException
+ private Symbol generate(Schema writer, Schema reader, Map<LitS, Symbol> seen)
+ throws IOException
{
final Schema.Type writerType = writer.getType();
final Schema.Type readerType = reader.getType();
@@ -204,6 +204,9 @@ public class ResolvingGrammarGenerator extends ValidatingGrammarGenerator {
private Symbol resolveUnion(Schema writer, Schema reader,
Map<LitS, Symbol> seen) throws IOException {
+ boolean needsAdj = ! unionEquiv(writer, reader, new HashMap<>());
+ List<Schema> alts2 = (!needsAdj ? reader.getTypes() : null);
+
List<Schema> alts = writer.getTypes();
final int size = alts.size();
Symbol[] symbols = new Symbol[size];
@@ -215,12 +218,72 @@ public class ResolvingGrammarGenerator extends ValidatingGrammarGenerator {
*/
int i = 0;
for (Schema w : alts) {
- symbols[i] = generate(w, reader, seen);
+ symbols[i] = generate(w, (needsAdj ? reader : alts2.get(i)), seen);
labels[i] = w.getFullName();
i++;
}
+ if (! needsAdj)
+ return Symbol.seq(Symbol.alt(symbols, labels), Symbol.UNION);
return Symbol.seq(Symbol.alt(symbols, labels),
- Symbol.writerUnionAction());
+ Symbol.WRITER_UNION_ACTION);
+ }
+
+ private static boolean unionEquiv(Schema w, Schema r, Map<LitS, Boolean> seen) {
+ Schema.Type wt = w.getType();
+ if (wt != r.getType()) return false;
+ if ((wt == Schema.Type.RECORD || wt == Schema.Type.FIXED || wt == Schema.Type.ENUM)
+ && ! (w.getFullName() == null || w.getFullName().equals(r.getFullName())))
+ return false;
+
+ switch (w.getType()) {
+ case NULL: case BOOLEAN: case INT: case LONG: case FLOAT: case DOUBLE:
+ case STRING: case BYTES:
+ return true;
+
+ case ARRAY: return unionEquiv(w.getElementType(), r.getElementType(), seen);
+ case MAP: return unionEquiv(w.getValueType(), r.getValueType(), seen);
+
+ case FIXED: return w.getFixedSize() == r.getFixedSize();
+
+ case ENUM: {
+ List<String> ws = w.getEnumSymbols();
+ List<String> rs = r.getEnumSymbols();
+ if (ws.size() != rs.size()) return false;
+ int i = 0;
+ for (i = 0; i < ws.size(); i++)
+ if (! ws.get(i).equals(rs.get(i))) break;
+ return i == ws.size();
+ }
+
+ case UNION: {
+ List<Schema> wb = w.getTypes();
+ List<Schema> rb = r.getTypes();
+ if (wb.size() != rb.size()) return false;
+ int i = 0;
+ for (i = 0; i < wb.size(); i++)
+ if (! unionEquiv(wb.get(i), rb.get(i), seen)) break;
+ return i == wb.size();
+ }
+
+ case RECORD: {
+ LitS wsc = new LitS2(w, r);
+ if (! seen.containsKey(wsc)) {
+ seen.put(wsc, true); // Be optimistic, but we may change our minds
+ List<Field> wb = w.getFields();
+ List<Field> rb = r.getFields();
+ if (wb.size() != rb.size()) seen.put(wsc, false);
+ else {
+ int i = 0;
+ for (i = 0; i < wb.size(); i++)
+ if (! unionEquiv(wb.get(i).schema(), rb.get(i).schema(), seen)) break;
+ seen.put(wsc, (i == wb.size()));
+ }
+ }
+ return seen.get(wsc);
+ }
+ default:
+ throw new IllegalArgumentException("Unknown schema type: " + w.getType());
+ }
}
private Symbol resolveRecords(Schema writer, Schema reader,
@@ -564,4 +627,3 @@ public class ResolvingGrammarGenerator extends ValidatingGrammarGenerator {
}
}
}
-
diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/parsing/Symbol.java b/lang/java/avro/src/main/java/org/apache/avro/io/parsing/Symbol.java
index 1879424..4494ec0 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/io/parsing/Symbol.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/io/parsing/Symbol.java
@@ -156,6 +156,27 @@ public abstract class Symbol {
* <tt>Sequence</tt> in the input are replaced by its production recursively.
* Non-<tt>Sequence</tt> symbols, they internally have other symbols
* those internal symbols also get flattened.
+ * When flattening is done, the only place there might be Sequence
+ * symbols is in the productions of a Repeater, Alternative, or the
+ * symToParse and symToSkip in a UnionAdjustAction or SkipAction.
+ *
+ * Why is this done? We want our parsers to be fast. If we left
+ * the grammars unflattened, then the parser would be constantly
+ * copying the contents of nested Sequence productions onto the
+ * parsing stack. Instead, because of flattening, we have a long
+ * top-level production with no Sequences unless the Sequence is
+ * absolutely needed, e.g., in the case of a Repeater or an
+ * Alternative.
+ *
+ * Well, this is not exactly true when recursion is involved. Where
+ * there is a recursive record, that record will be "inlined" once,
+ * but any internal (i.e., recursive) references to that record will
+ * be a Sequence for the record. That Sequence will not further
+ * inline itself -- it will refer to itself as a Sequence. The same
+ * is true for any records nested in this outer recursive record.
+ * Recursion is rare, and we want things to be fast in the typical
+ * case, which is why we do the flattening optimization.
+ *
*
* The algorithm does a few tricks to handle recursive symbol definitions.
* In order to avoid infinite recursion with recursive symbols, we have a map
@@ -490,10 +511,20 @@ public abstract class Symbol {
}
public static class EnumAdjustAction extends IntCheckAction {
+ public final boolean noAdjustments;
public final Object[] adjustments;
@Deprecated public EnumAdjustAction(int rsymCount, Object[] adjustments) {
super(rsymCount);
this.adjustments = adjustments;
+ boolean noAdj = true;
+ if (adjustments != null) {
+ int count = Math.min(rsymCount, adjustments.length);
+ noAdj = (adjustments.length <= rsymCount);
+ for (int i = 0; noAdj && i < count; i++)
+ noAdj &= ((adjustments[i] instanceof Integer)
+ && i == (Integer)adjustments[i]);
+ }
+ this.noAdjustments = noAdj;
}
}
@@ -559,9 +590,14 @@ public abstract class Symbol {
}
public static final class FieldOrderAction extends ImplicitAction {
+ public final boolean noReorder;
public final Schema.Field[] fields;
@Deprecated public FieldOrderAction(Schema.Field[] fields) {
this.fields = fields;
+ boolean noReorder = true;
+ for (int i = 0; noReorder && i < fields.length; i++)
+ noReorder &= (i == fields[i].pos());
+ this.noReorder = noReorder;
}
}
@@ -645,6 +681,8 @@ public abstract class Symbol {
public static final Symbol MAP_END = new Symbol.Terminal("map-end");
public static final Symbol ITEM_END = new Symbol.Terminal("item-end");
+ public static final Symbol WRITER_UNION_ACTION = writerUnionAction();
+
/* a pseudo terminal used by parsers */
public static final Symbol FIELD_ACTION =
new Symbol.Terminal("field-action");
diff --git a/lang/java/avro/src/test/java/org/apache/avro/io/parsing/SymbolTest.java b/lang/java/avro/src/test/java/org/apache/avro/io/parsing/SymbolTest.java
index ce4d7df..f5b9ca0 100644
--- a/lang/java/avro/src/test/java/org/apache/avro/io/parsing/SymbolTest.java
+++ b/lang/java/avro/src/test/java/org/apache/avro/io/parsing/SymbolTest.java
@@ -50,9 +50,7 @@ public class SymbolTest {
@Test
public void testSomeMethod() throws IOException {
Schema schema = new Schema.Parser().parse(SCHEMA);
-
- Symbol root = Symbol.root(new ResolvingGrammarGenerator()
- .generate(schema, schema, new HashMap<>()));
+ Symbol root = new ResolvingGrammarGenerator().generate(schema, schema);
validateNonNull(root, new HashSet<>());
}
diff --git a/lang/java/compiler/src/main/velocity/org/apache/avro/compiler/specific/templates/java/classic/record.vm b/lang/java/compiler/src/main/velocity/org/apache/avro/compiler/specific/templates/java/classic/record.vm
index 523c151..bef1425 100644
--- a/lang/java/compiler/src/main/velocity/org/apache/avro/compiler/specific/templates/java/classic/record.vm
+++ b/lang/java/compiler/src/main/velocity/org/apache/avro/compiler/specific/templates/java/classic/record.vm
@@ -555,25 +555,40 @@ public class ${this.mangle($schema.getName())}#if ($schema.isError()) extends or
@Override protected void customDecode(org.apache.avro.io.ResolvingDecoder in)
throws java.io.IOException
{
- org.apache.avro.Schema.Field[] fieldOrder = in.readFieldOrder();
- for (int i = 0; i < $schema.getFields().size(); i++) {
- switch (fieldOrder[i].pos()) {
+ org.apache.avro.Schema.Field[] fieldOrder = in.readFieldOrderIfDiff();
+ if (fieldOrder == null) {
+## Common case: order of fields hasn't changed, so read them in a
+## fixed order according to reader's schema
+#set ($nv = 0)## Counter to ensure unique var-names
+#set ($maxnv = 0)## Holds high-water mark during recursion
+#foreach ($field in $schema.getFields())
+#set ($n = $this.mangle($field.name(), $schema.isError()))
+#set ($s = $field.schema())
+#set ($rs = "SCHEMA$.getField(""${n}"").schema()")
+#decodeVar(2 "this.${n}" $s $rs)
+
+#set ($nv = $maxnv)
+#end
+ } else {
+ for (int i = 0; i < $schema.getFields().size(); i++) {
+ switch (fieldOrder[i].pos()) {
#set ($fieldno = 0)
#set ($nv = 0)## Counter to ensure unique var-names
#set ($maxnv = 0)## Holds high-water mark during recursion
#foreach ($field in $schema.getFields())
- case $fieldno:
+ case $fieldno:
#set ($n = $this.mangle($field.name(), $schema.isError()))
#set ($s = $field.schema())
#set ($rs = "SCHEMA$.getField(""${n}"").schema()")
-#decodeVar(4 "this.${n}" $s $rs)
- break;
+#decodeVar(6 "this.${n}" $s $rs)
+ break;
#set ($nv = $maxnv)
#set ($fieldno = $fieldno + 1)
#end
- default:
- throw new java.io.IOException("Corrupt ResolvingDecoder.");
+ default:
+ throw new java.io.IOException("Corrupt ResolvingDecoder.");
+ }
}
}
}
diff --git a/lang/java/tools/src/test/compiler/output-string/avro/examples/baseball/Player.java b/lang/java/tools/src/test/compiler/output-string/avro/examples/baseball/Player.java
index c972235..af6e486 100644
--- a/lang/java/tools/src/test/compiler/output-string/avro/examples/baseball/Player.java
+++ b/lang/java/tools/src/test/compiler/output-string/avro/examples/baseball/Player.java
@@ -503,40 +503,64 @@ public class Player extends org.apache.avro.specific.SpecificRecordBase implemen
@Override protected void customDecode(org.apache.avro.io.ResolvingDecoder in)
throws java.io.IOException
{
- org.apache.avro.Schema.Field[] fieldOrder = in.readFieldOrder();
- for (int i = 0; i < 4; i++) {
- switch (fieldOrder[i].pos()) {
- case 0:
- this.number = in.readInt();
- break;
-
- case 1:
- this.first_name = in.readString();
- break;
-
- case 2:
- this.last_name = in.readString();
- break;
-
- case 3:
- long size0 = in.readArrayStart();
- java.util.List<avro.examples.baseball.Position> a0 = this.position;
- if (a0 == null) {
- a0 = new SpecificData.Array<avro.examples.baseball.Position>((int)size0, SCHEMA$.getField("position").schema());
- this.position = a0;
- } else a0.clear();
- SpecificData.Array<avro.examples.baseball.Position> ga0 = (a0 instanceof SpecificData.Array ? (SpecificData.Array<avro.examples.baseball.Position>)a0 : null);
- for ( ; 0 < size0; size0 = in.arrayNext()) {
- for ( ; size0 != 0; size0--) {
- avro.examples.baseball.Position e0 = (ga0 != null ? ga0.peek() : null);
- e0 = avro.examples.baseball.Position.values()[in.readEnum()];
- a0.add(e0);
- }
+ org.apache.avro.Schema.Field[] fieldOrder = in.readFieldOrderIfDiff();
+ if (fieldOrder == null) {
+ this.number = in.readInt();
+
+ this.first_name = in.readString();
+
+ this.last_name = in.readString();
+
+ long size0 = in.readArrayStart();
+ java.util.List<avro.examples.baseball.Position> a0 = this.position;
+ if (a0 == null) {
+ a0 = new SpecificData.Array<avro.examples.baseball.Position>((int)size0, SCHEMA$.getField("position").schema());
+ this.position = a0;
+ } else a0.clear();
+ SpecificData.Array<avro.examples.baseball.Position> ga0 = (a0 instanceof SpecificData.Array ? (SpecificData.Array<avro.examples.baseball.Position>)a0 : null);
+ for ( ; 0 < size0; size0 = in.arrayNext()) {
+ for ( ; size0 != 0; size0--) {
+ avro.examples.baseball.Position e0 = (ga0 != null ? ga0.peek() : null);
+ e0 = avro.examples.baseball.Position.values()[in.readEnum()];
+ a0.add(e0);
}
- break;
+ }
+
+ } else {
+ for (int i = 0; i < 4; i++) {
+ switch (fieldOrder[i].pos()) {
+ case 0:
+ this.number = in.readInt();
+ break;
+
+ case 1:
+ this.first_name = in.readString();
+ break;
+
+ case 2:
+ this.last_name = in.readString();
+ break;
+
+ case 3:
+ long size0 = in.readArrayStart();
+ java.util.List<avro.examples.baseball.Position> a0 = this.position;
+ if (a0 == null) {
+ a0 = new SpecificData.Array<avro.examples.baseball.Position>((int)size0, SCHEMA$.getField("position").schema());
+ this.position = a0;
+ } else a0.clear();
+ SpecificData.Array<avro.examples.baseball.Position> ga0 = (a0 instanceof SpecificData.Array ? (SpecificData.Array<avro.examples.baseball.Position>)a0 : null);
+ for ( ; 0 < size0; size0 = in.arrayNext()) {
+ for ( ; size0 != 0; size0--) {
+ avro.examples.baseball.Position e0 = (ga0 != null ? ga0.peek() : null);
+ e0 = avro.examples.baseball.Position.values()[in.readEnum()];
+ a0.add(e0);
+ }
+ }
+ break;
- default:
- throw new java.io.IOException("Corrupt ResolvingDecoder.");
+ default:
+ throw new java.io.IOException("Corrupt ResolvingDecoder.");
+ }
}
}
}
diff --git a/lang/java/tools/src/test/compiler/output/Player.java b/lang/java/tools/src/test/compiler/output/Player.java
index af4e8f7..8376389 100644
--- a/lang/java/tools/src/test/compiler/output/Player.java
+++ b/lang/java/tools/src/test/compiler/output/Player.java
@@ -503,40 +503,64 @@ public class Player extends org.apache.avro.specific.SpecificRecordBase implemen
@Override protected void customDecode(org.apache.avro.io.ResolvingDecoder in)
throws java.io.IOException
{
- org.apache.avro.Schema.Field[] fieldOrder = in.readFieldOrder();
- for (int i = 0; i < 4; i++) {
- switch (fieldOrder[i].pos()) {
- case 0:
- this.number = in.readInt();
- break;
-
- case 1:
- this.first_name = in.readString(this.first_name instanceof Utf8 ? (Utf8)this.first_name : null);
- break;
-
- case 2:
- this.last_name = in.readString(this.last_name instanceof Utf8 ? (Utf8)this.last_name : null);
- break;
-
- case 3:
- long size0 = in.readArrayStart();
- java.util.List<avro.examples.baseball.Position> a0 = this.position;
- if (a0 == null) {
- a0 = new SpecificData.Array<avro.examples.baseball.Position>((int)size0, SCHEMA$.getField("position").schema());
- this.position = a0;
- } else a0.clear();
- SpecificData.Array<avro.examples.baseball.Position> ga0 = (a0 instanceof SpecificData.Array ? (SpecificData.Array<avro.examples.baseball.Position>)a0 : null);
- for ( ; 0 < size0; size0 = in.arrayNext()) {
- for ( ; size0 != 0; size0--) {
- avro.examples.baseball.Position e0 = (ga0 != null ? ga0.peek() : null);
- e0 = avro.examples.baseball.Position.values()[in.readEnum()];
- a0.add(e0);
- }
+ org.apache.avro.Schema.Field[] fieldOrder = in.readFieldOrderIfDiff();
+ if (fieldOrder == null) {
+ this.number = in.readInt();
+
+ this.first_name = in.readString(this.first_name instanceof Utf8 ? (Utf8)this.first_name : null);
+
+ this.last_name = in.readString(this.last_name instanceof Utf8 ? (Utf8)this.last_name : null);
+
+ long size0 = in.readArrayStart();
+ java.util.List<avro.examples.baseball.Position> a0 = this.position;
+ if (a0 == null) {
+ a0 = new SpecificData.Array<avro.examples.baseball.Position>((int)size0, SCHEMA$.getField("position").schema());
+ this.position = a0;
+ } else a0.clear();
+ SpecificData.Array<avro.examples.baseball.Position> ga0 = (a0 instanceof SpecificData.Array ? (SpecificData.Array<avro.examples.baseball.Position>)a0 : null);
+ for ( ; 0 < size0; size0 = in.arrayNext()) {
+ for ( ; size0 != 0; size0--) {
+ avro.examples.baseball.Position e0 = (ga0 != null ? ga0.peek() : null);
+ e0 = avro.examples.baseball.Position.values()[in.readEnum()];
+ a0.add(e0);
}
- break;
+ }
+
+ } else {
+ for (int i = 0; i < 4; i++) {
+ switch (fieldOrder[i].pos()) {
+ case 0:
+ this.number = in.readInt();
+ break;
+
+ case 1:
+ this.first_name = in.readString(this.first_name instanceof Utf8 ? (Utf8)this.first_name : null);
+ break;
+
+ case 2:
+ this.last_name = in.readString(this.last_name instanceof Utf8 ? (Utf8)this.last_name : null);
+ break;
+
+ case 3:
+ long size0 = in.readArrayStart();
+ java.util.List<avro.examples.baseball.Position> a0 = this.position;
+ if (a0 == null) {
+ a0 = new SpecificData.Array<avro.examples.baseball.Position>((int)size0, SCHEMA$.getField("position").schema());
+ this.position = a0;
+ } else a0.clear();
+ SpecificData.Array<avro.examples.baseball.Position> ga0 = (a0 instanceof SpecificData.Array ? (SpecificData.Array<avro.examples.baseball.Position>)a0 : null);
+ for ( ; 0 < size0; size0 = in.arrayNext()) {
+ for ( ; size0 != 0; size0--) {
+ avro.examples.baseball.Position e0 = (ga0 != null ? ga0.peek() : null);
+ e0 = avro.examples.baseball.Position.values()[in.readEnum()];
+ a0.add(e0);
+ }
+ }
+ break;
- default:
- throw new java.io.IOException("Corrupt ResolvingDecoder.");
+ default:
+ throw new java.io.IOException("Corrupt ResolvingDecoder.");
+ }
}
}
}