You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/03 17:23:56 UTC

[GitHub] [arrow] pitrou opened a new pull request #10862: ARROW-13132: [C++] Add Scalar validation

pitrou opened a new pull request #10862:
URL: https://github.com/apache/arrow/pull/10862


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682851194



##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -902,6 +1214,37 @@ TEST(TestDictionaryScalar, Basics) {
   }
 }
 
+TEST(TestDictionaryScalar, ValidateErrors) {
+  auto index_ty = int16();
+  auto value_ty = utf8();
+  auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma"])");

Review comment:
       I'd say `is_valid==true`

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       will do




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682884744



##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       https://issues.apache.org/jira/browse/ARROW-9006




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682716498



##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting
+    const auto& union_type = checked_cast<const UnionType&>(*s.type);
+    const auto& child_ids = union_type.child_ids();
+    if (type_code < 0 || type_code >= static_cast<int64_t>(child_ids.size()) ||
+        child_ids[type_code] == UnionType::kInvalidChildId) {

Review comment:
       `child_ids` is a mapping of type codes to (physical) child ids. What you are thinking about is `type_codes`, which is a mapping of (physical) child ids to type codes.

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       I have no idea. Can you open a JIRA for that?

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -902,6 +1214,37 @@ TEST(TestDictionaryScalar, Basics) {
   }
 }
 
+TEST(TestDictionaryScalar, ValidateErrors) {
+  auto index_ty = int16();
+  auto value_ty = utf8();
+  auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma"])");

Review comment:
       In this case, should `DictionaryScalar::is_valid` be true or false?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682851489



##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting
+    const auto& union_type = checked_cast<const UnionType&>(*s.type);
+    const auto& child_ids = union_type.child_ids();
+    if (type_code < 0 || type_code >= static_cast<int64_t>(child_ids.size()) ||
+        child_ids[type_code] == UnionType::kInvalidChildId) {

Review comment:
       ah, nevermind




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682664506



##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting

Review comment:
       ```suggestion
       const int type_code = s.type_code;  // avoid 8-bit int types for printing
   ```

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting
+    const auto& union_type = checked_cast<const UnionType&>(*s.type);
+    const auto& child_ids = union_type.child_ids();
+    if (type_code < 0 || type_code >= static_cast<int64_t>(child_ids.size()) ||
+        child_ids[type_code] == UnionType::kInvalidChildId) {

Review comment:
       IIUC, `UnionScalar::type_code` contains one of the values in `child_ids` and not the index of one of the values in `child_ids` so it must be validated using `std::find` or so instead of by comparison to `child_ids.size()`

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?

Review comment:
       SGTM, please do

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -902,6 +1214,37 @@ TEST(TestDictionaryScalar, Basics) {
   }
 }
 
+TEST(TestDictionaryScalar, ValidateErrors) {
+  auto index_ty = int16();
+  auto value_ty = utf8();
+  auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma"])");

Review comment:
       For completeness, would you add a case where a valid index encodes a null dictionary entry?
   ```suggestion
     auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma", null])");
   ```

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }

Review comment:
       Nit: de-nest this for clarity
   ```suggestion
       if (s.is_valid && !s.value.index->is_valid) {
         return Status::Invalid("non-null ", s.type->ToString(),
                                " scalar has null index value");
       }
   
       if (!s.is_valid && s.value.index->is_valid) {
         return Status::Invalid("null ", s.type->ToString(),
                                " scalar has non-null index value");
       }
   ```

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       Now that compute::Cast supports scalars for all types, is it worth keeping Scalar::Parse and Scalar::CastTo?

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -902,6 +1214,37 @@ TEST(TestDictionaryScalar, Basics) {
   }
 }
 
+TEST(TestDictionaryScalar, ValidateErrors) {
+  auto index_ty = int16();
+  auto value_ty = utf8();
+  auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma"])");

Review comment:
       I'd say `is_valid==true`

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       will do

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting
+    const auto& union_type = checked_cast<const UnionType&>(*s.type);
+    const auto& child_ids = union_type.child_ids();
+    if (type_code < 0 || type_code >= static_cast<int64_t>(child_ids.size()) ||
+        child_ids[type_code] == UnionType::kInvalidChildId) {

Review comment:
       ah, nevermind

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       https://issues.apache.org/jira/browse/ARROW-9006




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682664506



##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting

Review comment:
       ```suggestion
       const int type_code = s.type_code;  // avoid 8-bit int types for printing
   ```

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting
+    const auto& union_type = checked_cast<const UnionType&>(*s.type);
+    const auto& child_ids = union_type.child_ids();
+    if (type_code < 0 || type_code >= static_cast<int64_t>(child_ids.size()) ||
+        child_ids[type_code] == UnionType::kInvalidChildId) {

Review comment:
       IIUC, `UnionScalar::type_code` contains one of the values in `child_ids` and not the index of one of the values in `child_ids` so it must be validated using `std::find` or so instead of by comparison to `child_ids.size()`

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?

Review comment:
       SGTM, please do

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -902,6 +1214,37 @@ TEST(TestDictionaryScalar, Basics) {
   }
 }
 
+TEST(TestDictionaryScalar, ValidateErrors) {
+  auto index_ty = int16();
+  auto value_ty = utf8();
+  auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma"])");

Review comment:
       For completeness, would you add a case where a valid index encodes a null dictionary entry?
   ```suggestion
     auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma", null])");
   ```

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }

Review comment:
       Nit: de-nest this for clarity
   ```suggestion
       if (s.is_valid && !s.value.index->is_valid) {
         return Status::Invalid("non-null ", s.type->ToString(),
                                " scalar has null index value");
       }
   
       if (!s.is_valid && s.value.index->is_valid) {
         return Status::Invalid("null ", s.type->ToString(),
                                " scalar has non-null index value");
       }
   ```

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       Now that compute::Cast supports scalars for all types, is it worth keeping Scalar::Parse and Scalar::CastTo?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou closed pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #10862:
URL: https://github.com/apache/arrow/pull/10862


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682724321



##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -902,6 +1214,37 @@ TEST(TestDictionaryScalar, Basics) {
   }
 }
 
+TEST(TestDictionaryScalar, ValidateErrors) {
+  auto index_ty = int16();
+  auto value_ty = utf8();
+  auto dict = ArrayFromJSON(value_ty, R"(["alpha", "beta", "gamma"])");

Review comment:
       In this case, should `DictionaryScalar::is_valid` be true or false?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#issuecomment-895334946


   I believe I addressed all review comments. Will merge if CI passes.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#issuecomment-892025469


   https://issues.apache.org/jira/browse/ARROW-13132


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682723742



##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -37,14 +37,52 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) {
+  const auto scalar = MakeNullScalar(type);
+  ARROW_EXPECT_OK(scalar->Validate());
+  ARROW_EXPECT_OK(scalar->ValidateFull());
+  AssertTypeEqual(*type, *scalar->type);
+  EXPECT_FALSE(scalar->is_valid);
+  return scalar;
+}
+
+template <typename... MakeScalarArgs>
+void AssertMakeScalar(const Scalar& expected, MakeScalarArgs&&... args) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(std::forward<MakeScalarArgs>(args)...));
+  ASSERT_OK(scalar->Validate());
+  ASSERT_OK(scalar->ValidateFull());
+  AssertScalarsEqual(expected, *scalar, /*verbose=*/true);
+}
+
+void AssertParseScalar(const std::shared_ptr<DataType>& type, const util::string_view& s,

Review comment:
       I have no idea. Can you open a JIRA for that?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10862: ARROW-13132: [C++] Add Scalar validation

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10862:
URL: https://github.com/apache/arrow/pull/10862#discussion_r682716498



##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -152,10 +154,261 @@ struct ScalarHashImpl {
   size_t hash_;
 };
 
+// Implementation of Scalar::Validate() and Scalar::ValidateFull()
+struct ScalarValidateImpl {
+  const bool full_validation_;
+
+  explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) {
+    ::arrow::util::InitializeUTF8();
+  }
+
+  Status Validate(const Scalar& scalar) {
+    if (!scalar.type) {
+      return Status::Invalid("scalar lacks a type");
+    }
+    return VisitScalarInline(scalar, this);
+  }
+
+  Status Visit(const NullScalar& s) {
+    if (s.is_valid) {
+      return Status::Invalid("null scalar should have is_valid = false");
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const internal::PrimitiveScalar<T>& s) {
+    return Status::OK();
+  }
+
+  Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); }
+
+  Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    RETURN_NOT_OK(ValidateBinaryScalar(s));
+    if (s.is_valid) {
+      const auto& byte_width =
+          checked_cast<const FixedSizeBinaryType&>(*s.type).byte_width();
+      if (s.value->size() != byte_width) {
+        return Status::Invalid(s.type->ToString(), " scalar should have a value of size ",
+                               byte_width, ", got ", s.value->size());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Scalar& s) {
+    // XXX validate precision?
+    return Status::OK();
+  }
+
+  Status Visit(const BaseListScalar& s) { return ValidateBaseListScalar(s); }
+
+  Status Visit(const FixedSizeListScalar& s) {
+    RETURN_NOT_OK(ValidateBaseListScalar(s));
+    if (s.is_valid) {
+      const auto& list_type = checked_cast<const FixedSizeListType&>(*s.type);
+      if (s.value->length() != list_type.list_size()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar should have a child value of length ",
+                               list_type.list_size(), ", got ", s.value->length());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructScalar& s) {
+    if (!s.is_valid) {
+      if (!s.value.empty()) {
+        return Status::Invalid(s.type->ToString(),
+                               " scalar is marked null but has child values");
+      }
+      return Status::OK();
+    }
+    const int num_fields = s.type->num_fields();
+    const auto& fields = s.type->fields();
+    if (fields.size() != s.value.size()) {
+      return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ",
+                             num_fields, " child values, got ", s.value.size());
+    }
+    for (int i = 0; i < num_fields; ++i) {
+      if (!s.value[i]) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has missing child value at index ", i);
+      }
+      const auto st = Validate(*s.value[i]);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for child at index ", i, ": ",
+                              st.message());
+      }
+      if (!s.value[i]->type->Equals(*fields[i]->type())) {
+        return Status::Invalid(
+            s.type->ToString(), " scalar should have a child value of type ",
+            fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryScalar& s) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*s.type);
+    // Validate index
+    if (!s.value.index) {
+      return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value");
+    }
+    {
+      const auto st = Validate(*s.value.index);
+      if (!st.ok()) {
+        return st.WithMessage(s.type->ToString(),
+                              " scalar fails validation for index value: ", st.message());
+      }
+    }
+    if (!s.value.index->type->Equals(*dict_type.index_type())) {
+      return Status::Invalid(
+          s.type->ToString(), " scalar should have an index value of type ",
+          dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString());
+    }
+    if (s.is_valid != s.value.index->is_valid) {
+      if (s.is_valid) {
+        return Status::Invalid("non-null ", s.type->ToString(),
+                               " scalar has null index value");
+      } else {
+        return Status::Invalid("null ", s.type->ToString(),
+                               " scalar has non-null index value");
+      }
+    }
+
+    // Validate dictionary
+    if (!s.value.dictionary) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar doesn't have a dictionary value");
+    }
+    {
+      const auto st = full_validation_ ? s.value.dictionary->ValidateFull()
+                                       : s.value.dictionary->Validate();
+      if (!st.ok()) {
+        return st.WithMessage(
+            s.type->ToString(),
+            " scalar fails validation for dictionary value: ", st.message());
+      }
+    }
+    if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) {
+      return Status::Invalid(s.type->ToString(),
+                             " scalar should have a dictionary value of type ",
+                             dict_type.value_type()->ToString(), ", got ",
+                             s.value.dictionary->type()->ToString());
+    }
+
+    // XXX check index is in bounds?
+    return Status::OK();
+  }
+
+  Status Visit(const UnionScalar& s) {
+    RETURN_NOT_OK(ValidateOptionalValue(s));
+    const int type_code = s.type_code;  // avoid 8-bit int types for priting
+    const auto& union_type = checked_cast<const UnionType&>(*s.type);
+    const auto& child_ids = union_type.child_ids();
+    if (type_code < 0 || type_code >= static_cast<int64_t>(child_ids.size()) ||
+        child_ids[type_code] == UnionType::kInvalidChildId) {

Review comment:
       `child_ids` is a mapping of type codes to (physical) child ids. What you are thinking about is `type_codes`, which is a mapping of (physical) child ids to type codes.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org