You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by GitBox <gi...@apache.org> on 2020/05/02 19:57:13 UTC

[GitHub] [drill] vvysotskyi commented on a change in pull request #2068: DRILL-7717: Support Mongo extended types in V2 JSON loader

vvysotskyi commented on a change in pull request #2068:
URL: https://github.com/apache/drill/pull/2068#discussion_r418995949



##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/extended/ExtendedTypeFieldFactory.java
##########
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.easy.json.extended;
+
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.record.metadata.MetadataUtils;
+import org.apache.drill.exec.store.easy.json.loader.BaseFieldFactory;
+import org.apache.drill.exec.store.easy.json.loader.FieldDefn;
+import org.apache.drill.exec.store.easy.json.loader.FieldFactory;
+import org.apache.drill.exec.store.easy.json.loader.JsonLoaderImpl;
+import org.apache.drill.exec.store.easy.json.parser.ElementParser;
+import org.apache.drill.exec.store.easy.json.parser.TokenIterator;
+import org.apache.drill.exec.store.easy.json.parser.ValueParser;
+import org.apache.drill.exec.store.easy.json.values.BinaryValueListener;
+import org.apache.drill.exec.store.easy.json.values.UtcDateValueListener;
+import org.apache.drill.exec.store.easy.json.values.DecimalValueListener;
+import org.apache.drill.exec.store.easy.json.values.IntervalValueListener;
+import org.apache.drill.exec.store.easy.json.values.StrictBigIntValueListener;
+import org.apache.drill.exec.store.easy.json.values.StrictDoubleValueListener;
+import org.apache.drill.exec.store.easy.json.values.StrictIntValueListener;
+import org.apache.drill.exec.store.easy.json.values.StrictStringValueListener;
+import org.apache.drill.exec.store.easy.json.values.TimeValueListener;
+import org.apache.drill.exec.store.easy.json.values.UtcTimestampValueListener;
+
+import com.fasterxml.jackson.core.JsonToken;
+
+public class ExtendedTypeFieldFactory extends BaseFieldFactory {
+
+  public ExtendedTypeFieldFactory(JsonLoaderImpl loader, FieldFactory child) {
+    super(loader, child);
+  }
+
+  @Override
+  public ElementParser fieldParser(FieldDefn fieldDefn) {
+    ElementParser parser = buildExtendedTypeParser(fieldDefn);
+    if (parser == null) {
+      return child.fieldParser(fieldDefn);
+    } else {
+      return parser;
+    }
+  }
+
+  private ElementParser buildExtendedTypeParser(FieldDefn fieldDefn) {
+
+    // Extended types are objects: { "$type": ... }
+    // Extended arrays are [ { "$type": ...
+    TokenIterator tokenizer = fieldDefn.tokenizer();
+    JsonToken token = tokenizer.requireNext();
+    ElementParser parser;
+    switch (token) {
+      case START_OBJECT:
+        parser = extendedTypeParserFor(fieldDefn, false);
+        break;
+      case START_ARRAY:
+        parser = arrayParserFor(fieldDefn);
+        break;
+      default:
+        parser = null;
+    }
+    tokenizer.unget(token);
+    return parser;
+  }
+
+  private ElementParser arrayParserFor(FieldDefn fieldDefn) {
+    TokenIterator tokenizer = fieldDefn.tokenizer();
+    JsonToken token = tokenizer.requireNext();
+    if (token != JsonToken.START_OBJECT) {
+      tokenizer.unget(token);
+      return null;
+    }
+
+    ValueParser element = extendedTypeParserFor(fieldDefn, true);
+    tokenizer.unget(token);
+    if (element == null) {
+      return null;
+    }
+
+    return scalarArrayParserFor(element);
+  }
+
+  private BaseExtendedValueParser extendedTypeParserFor(FieldDefn fieldDefn, boolean isArray) {
+    TokenIterator tokenizer = fieldDefn.tokenizer();
+    JsonToken token = tokenizer.peek();
+    if (token != JsonToken.FIELD_NAME) {
+      return null;
+    }
+
+    String key = tokenizer.textValue().trim();
+    if (!key.startsWith(ExtendedTypeNames.TYPE_PREFIX)) {
+      return null;
+    }
+    return parserFor(fieldDefn, key, isArray);
+  }
+
+  private BaseExtendedValueParser parserFor(FieldDefn fieldDefn, String key, boolean isArray) {
+    switch (key) {
+    case ExtendedTypeNames.LONG:
+      return numberLongParser(fieldDefn, isArray);
+    case ExtendedTypeNames.DECIMAL:
+      return numberDecimalParser(fieldDefn, isArray);
+    case ExtendedTypeNames.DOUBLE:
+      return numberDoubleParser(fieldDefn, isArray);
+    case ExtendedTypeNames.INT:
+      return numberIntParser(fieldDefn, isArray);
+    case ExtendedTypeNames.DATE:
+      return dateParser(fieldDefn, isArray);
+    case ExtendedTypeNames.BINARY:
+    case ExtendedTypeNames.BINARY_TYPE:
+      return binaryParser(fieldDefn, isArray);
+    case ExtendedTypeNames.OBJECT_ID:
+      return oidParser(fieldDefn, isArray);
+    case ExtendedTypeNames.DATE_DAY:
+      return dateDayParser(fieldDefn, isArray);
+    case ExtendedTypeNames.TIME:
+      return timeParser(fieldDefn, isArray);
+    case ExtendedTypeNames.INTERVAL:
+      return intervalParser(fieldDefn, isArray);
+    default:
+      return null;
+    }
+  }
+
+  private BaseExtendedValueParser numberLongParser(FieldDefn fieldDefn, boolean isArray) {
+    return new SimpleExtendedValueParser(
+        fieldDefn.parser(), ExtendedTypeNames.LONG,
+        new StrictBigIntValueListener(loader(),
+            fieldDefn.scalarWriterFor(MinorType.BIGINT, isArray)));
+  }
+
+  private BaseExtendedValueParser numberDecimalParser(FieldDefn fieldDefn, boolean isArray) {
+    // No information about precision and scale, so guess (38, 10).

Review comment:
       Is it possible to obtain information about scale and precision from `fieldDefn`? For the case of the schema provisioning users should be able to specify target scale and precision.

##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/extended/MongoBinaryValueParser.java
##########
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.easy.json.extended;
+
+import org.apache.drill.exec.store.easy.json.parser.JsonStructureParser;
+import org.apache.drill.exec.store.easy.json.parser.TokenIterator;
+import org.apache.drill.exec.store.easy.json.values.ScalarListener;
+
+import com.fasterxml.jackson.core.JsonToken;
+
+/**
+ * Parsers a binary. Ignores the subtype field.</pre>
+ */
+public class MongoBinaryValueParser extends BaseExtendedValueParser {
+
+  protected static final String BINARY_HINT =
+      "{\"$binary\": {base64: (\"<payload>\", subType: \"<t>\" }) | " +
+        "(\"<payload>\", \"$type\": \"<t>\") }";
+
+  public MongoBinaryValueParser(JsonStructureParser structParser, ScalarListener listener) {
+    super(structParser, listener);
+  }
+
+  @Override
+  protected String typeName() { return ExtendedTypeNames.BINARY; }
+
+  @Override
+  public void parse(TokenIterator tokenizer) {
+
+    // Null: assume the value is null
+    // (Extension to extended types)
+    JsonToken token = tokenizer.requireNext();
+    if (token == JsonToken.VALUE_NULL) {
+      listener.onValue(token, tokenizer);
+      return;
+    }
+
+    // Value is a scalar, assume binary value as a string.
+    // This is a harmless extension to the standard.
+     if (token.isScalarValue()) {
+      listener.onValue(token, tokenizer);
+      return;
+    }
+
+    // Must be an object
+    requireToken(token, JsonToken.START_OBJECT);
+
+    // { ^($binary | $type)
+    requireToken(tokenizer, JsonToken.FIELD_NAME);
+    String fieldName = tokenizer.textValue();
+    if (fieldName.equals(ExtendedTypeNames.BINARY_TYPE)) {
+      // { $type ^
+      parseV1Format(fieldName, tokenizer);
+    } else if (!fieldName.equals(ExtendedTypeNames.BINARY)) {
+      throw syntaxError();
+    } else if (tokenizer.peek() == JsonToken.START_OBJECT) {
+      // { $binary: ^{
+      parseV2Format(tokenizer);
+    } else {
+      // { $binary: ^value
+      parseV1Format(fieldName, tokenizer);
+    }
+  }
+
+  // Parse field: { ($binary | $type) ^ ...
+  private void parseV1Format(String fieldName, TokenIterator tokenizer) {
+    boolean sawData = false;
+    for (;;) {
+      // key: ^value
+      JsonToken token = requireScalar(tokenizer);
+      if (fieldName.equals(ExtendedTypeNames.BINARY)) {
+        if (sawData) {
+          syntaxError();
+        }
+        sawData = true;
+        listener.onValue(token, tokenizer);
+      }
+
+      // key: value ^(} | key ...)
+      token = tokenizer.requireNext();
+      if (token == JsonToken.END_OBJECT) {
+        break;
+      } if (token != JsonToken.FIELD_NAME) {
+        syntaxError();
+      }
+      fieldName = tokenizer.textValue();
+      switch (fieldName) {
+      case ExtendedTypeNames.BINARY:
+      case ExtendedTypeNames.BINARY_TYPE:
+        break;
+      default:
+        syntaxError();
+      }
+    }
+    if (!sawData) {
+      syntaxError();
+    }
+  }
+
+  // Parse field: { $binary: ^{ "base64": "<payload>", "subType": "<t>" } }
+  // With fields in either order
+  private void parseV2Format(TokenIterator tokenizer) {
+    boolean sawData = false;
+    requireToken(tokenizer, JsonToken.START_OBJECT);
+    for (;;) {
+      JsonToken token = tokenizer.requireNext();
+      if (token == JsonToken.END_OBJECT) {
+        break;
+      } else if (token != JsonToken.FIELD_NAME) {
+        throw syntaxError();
+      }
+      switch (tokenizer.textValue()) {
+        case "base64":
+          if (sawData) {
+            syntaxError();

Review comment:
       Looks like `throw` is missed here and in other places.
   ```suggestion
               throw syntaxError();
   ```

##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/loader/JsonLoaderImpl.java
##########
@@ -222,18 +232,41 @@ private JsonLoaderImpl(JsonLoaderBuilder builder) {
     this.rsLoader = builder.rsLoader;
     this.options = builder.options;
     this.errorContext = builder. errorContext;
-    this.rowListener = new TupleListener(this, rsLoader.writer(), builder.providedSchema);
-    this.parser = new JsonStructureParserBuilder()
+    this.fieldFactory = buildFieldFactory(builder);
+    this.parser = buildParser(builder);
+  }
+
+  private JsonStructureParser buildParser(JsonLoaderBuilder builder) {
+    return new JsonStructureParserBuilder()
             .fromStream(builder.stream)
             .fromReader(builder.reader)
             .options(builder.options)
-            .rootListener(rowListener)
+            .parserFactory(new ParserFactory() {
+              @Override
+              public ObjectParser rootParser(JsonStructureParser parser) {
+                return new TupleParser(parser, JsonLoaderImpl.this, rsLoader.writer(), builder.providedSchema);
+              }
+            })

Review comment:
       Please replace it with lambda:
   ```suggestion
               .parserFactory(parser -> new TupleParser(parser, JsonLoaderImpl.this, rsLoader.writer(), builder.providedSchema))
   ```

##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/ObjectParser.java
##########
@@ -82,26 +100,83 @@
  * the array is multi-dimensional, there will be multiple array/value
  * parser pairs: one for each dimension.
  */
-public class ObjectParser extends AbstractElementParser {
-  private final ObjectListener listener;
+public abstract class ObjectParser extends AbstractElementParser {
+  protected static final Logger logger = LoggerFactory.getLogger(ObjectParser.class);
+
   private final Map<String, ElementParser> members = CaseInsensitiveMap.newHashMap();
 
-  public ObjectParser(ElementParser parent, ObjectListener listener) {
-    super(parent);
-    this.listener = listener;
+  public ObjectParser(JsonStructureParser structParser) {
+    super(structParser);
   }
 
-  public ObjectListener listener() { return listener; }
+  @VisibleForTesting
+  public ElementParser fieldParser(String key) {
+    return members.get(key);
+  }
+
+  /**
+   * Called at the start of a set of values for an object. That is, called
+   * when the structure parser accepts the <code>{</code> token.
+   */
+  protected void onStart() { }
+
+  /**
+   * The structure parser has just encountered a new field for this
+   * object. This method returns a parser for the field, along with
+   * an optional listener to handle events within the field. THe field typically

Review comment:
       ```suggestion
      * an optional listener to handle events within the field. The field typically
   ```

##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/FieldParserFactory.java
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.easy.json.parser;
+
+import org.apache.drill.exec.store.easy.json.parser.ArrayValueParser.LenientArrayValueParser;
+import org.apache.drill.exec.store.easy.json.parser.JsonStructureParser.ParserFactory;
+import org.apache.drill.exec.store.easy.json.parser.ScalarValueParser.SimpleValueParser;
+import org.apache.drill.exec.store.easy.json.parser.ScalarValueParser.TextValueParser;
+import org.apache.drill.exec.store.easy.json.parser.ValueDef.JsonType;
+
+
+/**
+ * Creates a field parser given a field description and an optional field
+ * listener.
+ * <p>
+ * Parse position: <code>{ ... field : ^ ?</code> for a newly-seen field.
+ * Constructs a value parser and its listeners by looking ahead
+ * some number of tokens to "sniff" the type of the value. For
+ * example:
+ * <ul>
+ * <li>{@code foo: <value>} - Field value</li>
+ * <li>{@code foo: [ <value> ]} - 1D array value</li>
+ * <li>{@code foo: [ [<value> ] ]} - 2D array value</li>
+ * <li>Etc.</li>
+ * </ul>
+ * <p>
+ * There are two cases in which no type estimation is possible:
+ * <ul>
+ * <li>The value is {@code null}, indicated by
+ * {@link JsonType#NULL}.</code>
+ * <li>The value is an array, and the array is empty, indicated
+ * by {@link JsonType#EMPTY}.</li>
+ * </ul>
+ * {@link ValueDefFactory} handles syntactic type inference. The associated
+ * listener enforces semantic rules. For example, if a schema is
+ * available, and we know that field "x" must be an Integer, but
+ * this class reports that it is an object, then the listener should
+ * raise an exception.
+ * <p>
+ * Also, the parser cannot enforce type consistency. This method
+ * looks only at the first appearance of a value: a sample size of
+ * one. JSON allows anything.
+ * The listener must enforce semantic rules that say whether a different
+ * type is allowed for later values.
+ *
+ * @param key the name of the field

Review comment:
       `@param` tags shouldn't appear in class javadocs...

##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/values/UtcTimestampValueListener.java
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.easy.json.values;
+
+import java.time.Instant;
+import java.time.ZoneId;
+
+import org.apache.drill.exec.store.easy.json.loader.JsonLoaderImpl;
+import org.apache.drill.exec.store.easy.json.parser.TokenIterator;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+
+import com.fasterxml.jackson.core.JsonToken;
+
+/**
+ * Per the <a href="https://docs.mongodb.com/manual/reference/mongodb-extended-json-v1/#bson.data_date">
+ * V1 docs</a>:
+ * <quote>
+ * In Strict mode, {@code <date>} is an ISO-8601 date format with a mandatory time zone field
+ * following the template YYYY-MM-DDTHH:mm:ss.mmm<+/-Offset>.
+ * <p>
+ * In Shell mode, {@code <date>} is the JSON representation of a 64-bit signed
+ * integer giving the number of milliseconds since epoch UTC.
+ * </quote>
+ * <p>
+ * Drill dates are in the local time zone, so conversion is needed.
+ */
+public class UtcTimestampValueListener extends ScalarListener {
+
+  private final ZoneId localZoneId = ZoneId.systemDefault();

Review comment:
       It is safe to make it static.

##########
File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/loader/EmptyArrayFieldParser.java
##########
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.easy.json.loader;
+
+import org.apache.drill.exec.store.easy.json.loader.JsonLoaderImpl.NullTypeMarker;
+import org.apache.drill.exec.store.easy.json.parser.ElementParser;
+import org.apache.drill.exec.store.easy.json.parser.EmptyArrayParser;
+import org.apache.drill.exec.store.easy.json.parser.TokenIterator;
+
+import com.fasterxml.jackson.core.JsonToken;
+
+/**
+ * Represents a run of empty arrays for which we have no type information.
+ * Resolves to an actual type when a non-empty array appears. Must resolve
+ * the array even if it contains nulls so we can count the null values.
+ */
+public class EmptyArrayFieldParser extends EmptyArrayParser implements NullTypeMarker {
+
+  private final TupleParser tupleParser;
+
+  public EmptyArrayFieldParser(TupleParser tupleParser, String key) {
+    super(tupleParser.structParser(), key);
+    this.tupleParser = tupleParser;
+    tupleParser.loader().addNullMarker(this);
+  }
+
+  @Override
+  public void forceResolution() {
+    tupleParser.loader().removeNullMarker(this);
+    tupleParser.forceEmptyArrayResolution(key);
+  }
+
+  /**
+   * The column type is now known from context. Create a new array
+   * column, writer and parser to replace this parser.
+   */
+  @Override
+  protected ElementParser resolve(TokenIterator tokenizer) {
+    tupleParser.loader().removeNullMarker(this);
+    if (tokenizer.peek() == JsonToken.START_ARRAY) {
+
+      // The value is [], [ foo, so resolve directly
+      return tupleParser.resolveField(key, tokenizer);
+    } else {

Review comment:
       Could you please remove the empty block?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org