You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by np...@apache.org on 2020/02/09 16:59:09 UTC

[arrow] branch master updated: ARROW-7662: [R] Support creating ListArray from R list

This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new bf1653e  ARROW-7662: [R] Support creating ListArray from R list
bf1653e is described below

commit bf1653e4cf604a52ea596b2ab5bcf0f7f17e6586
Author: Michael Chirico <mi...@gmail.com>
AuthorDate: Sun Feb 9 08:58:45 2020 -0800

    ARROW-7662: [R] Support creating ListArray from R list
    
    Closes #6275 from MichaelChirico/r-parquet-array and squashes the following commits:
    
    2914c50c6 <Neal Richardson> Merge branch 'master' into r-parquet-array
    a8f26a198 <François Saint-Jacques> Address comments
    fc75d8631 <François Saint-Jacques> Implement R's vector to arrow::Array conversion
    a3a406b31 <Michael Chirico> half step forward
    93a0b9055 <Michael Chirico> progress (?) by mimicking MakeStringArray
    6d172df53 <Michael Chirico> more intermediate
    45663c250 <Michael Chirico> initial attempt at CheckCompatibleList
    cacd79904 <Michael Chirico> skeleton of way forward
    9cdd2fe27 <Michael Chirico> linting
    62cc75c5c <Michael Chirico> linting again
    331fa3d1e <Michael Chirico> linting
    761d083f3 <Michael Chirico> Merge branch 'master' into r-parquet-array
    7e1d33133 <Michael Chirico> Merge branch 'r-parquet-array' of github.com:MichaelChirico/arrow into r-parquet-array
    1de3dc4eb <Michael Chirico> fix typos
    35ae9c885 <Michael Chirico> Update r/NEWS.md
    c23478802 <Michael Chirico> initial foray into adding list column support for parquet writing
    
    Lead-authored-by: Michael Chirico <mi...@gmail.com>
    Co-authored-by: Michael Chirico <mi...@grabtaxi.com>
    Co-authored-by: François Saint-Jacques <fs...@gmail.com>
    Co-authored-by: Neal Richardson <ne...@gmail.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/NEWS.md                          |   1 +
 r/src/array_from_vector.cpp        | 404 +++++++++++++++++++++++++------------
 r/src/arrow_types.h                |   5 +-
 r/src/expression.cpp               |   2 +-
 r/tests/testthat/test-Array.R      |  63 +++++-
 r/tests/testthat/test-array-data.R |  11 +-
 r/tests/testthat/test-parquet.R    |  15 ++
 7 files changed, 368 insertions(+), 133 deletions(-)

diff --git a/r/NEWS.md b/r/NEWS.md
index 06b7bc0..ff7f663 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -23,6 +23,7 @@
 * `write_feather`, `write_arrow` and `write_parquet` now return their input
 similar to `write_*` functions from `readr` (#6387, @boshek)
 * Dataset filtering is now correctly supported for all Arrow date/time/timestamp column types.
+* Can now infer the type of an R `list` and create a ListArray when all list elements are the same type (#6275, @michaelchirico)
 
 # arrow 0.16.0
 
diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp
index 02a3403..f794e19 100644
--- a/r/src/array_from_vector.cpp
+++ b/r/src/array_from_vector.cpp
@@ -15,9 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <memory>
+
 #include "./arrow_types.h"
+
 #if defined(ARROW_R_WITH_ARROW)
 
+using arrow::internal::checked_cast;
+
 namespace arrow {
 namespace r {
 
@@ -41,75 +46,179 @@ inline bool is_na<int>(int value) {
   return value == NA_INTEGER;
 }
 
-std::shared_ptr<Array> MakeStringArray(Rcpp::StringVector_ vec) {
-  R_xlen_t n = vec.size();
+std::shared_ptr<arrow::DataType> InferArrowType(SEXP x);
 
-  std::shared_ptr<Buffer> null_buffer;
-  std::shared_ptr<Buffer> offset_buffer;
-  std::shared_ptr<Buffer> value_buffer;
+struct VectorToArrayConverter {
+  Status Visit(const arrow::NullType& type) {
+    auto* null_builder = checked_cast<NullBuilder*>(builder);
+    return null_builder->AppendNulls(XLENGTH(x));
+  }
 
-  // there is always an offset buffer
-  STOP_IF_NOT_OK(AllocateBuffer((n + 1) * sizeof(int32_t), &offset_buffer));
+  Status Visit(const arrow::BooleanType& type) {
+    ARROW_RETURN_IF(TYPEOF(x) != LGLSXP, Status::RError("Expecting a logical vector"));
+    R_xlen_t n = XLENGTH(x);
 
-  R_xlen_t i = 0;
-  int current_offset = 0;
-  int64_t null_count = 0;
-  auto p_offset = reinterpret_cast<int32_t*>(offset_buffer->mutable_data());
-  *p_offset = 0;
-  for (++p_offset; i < n; i++, ++p_offset) {
-    SEXP s = STRING_ELT(vec, i);
-    if (s == NA_STRING) {
-      // break as we are going to need a null_bitmap buffer
-      break;
+    auto* bool_builder = checked_cast<BooleanBuilder*>(builder);
+    auto* p = LOGICAL(x);
+
+    RETURN_NOT_OK(bool_builder->Reserve(n));
+    for (R_xlen_t i = 0; i < n; i++) {
+      auto value = p[i];
+      if (value == NA_LOGICAL) {
+        bool_builder->UnsafeAppendNull();
+      } else {
+        bool_builder->UnsafeAppend(value == 1);
+      }
     }
+    return Status::OK();
+  }
+
+  Status Visit(const arrow::Int32Type& type) {
+    ARROW_RETURN_IF(TYPEOF(x) != INTSXP, Status::RError("Expecting an integer vector"));
 
-    *p_offset = current_offset += LENGTH(s);
+    auto* int_builder = checked_cast<Int32Builder*>(builder);
+
+    R_xlen_t n = XLENGTH(x);
+    const auto* data = INTEGER(x);
+
+    RETURN_NOT_OK(int_builder->Reserve(n));
+    for (R_xlen_t i = 0; i < n; i++) {
+      const auto value = data[i];
+      if (value == NA_INTEGER) {
+        int_builder->UnsafeAppendNull();
+      } else {
+        int_builder->UnsafeAppend(value);
+      }
+    }
+
+    return Status::OK();
   }
 
-  if (i < n) {
-    STOP_IF_NOT_OK(AllocateBuffer(BitUtil::BytesForBits(n), &null_buffer));
-    internal::FirstTimeBitmapWriter null_bitmap_writer(null_buffer->mutable_data(), 0, n);
+  Status Visit(const arrow::Int64Type& type) {
+    ARROW_RETURN_IF(TYPEOF(x) != REALSXP, Status::RError("Expecting a numeric vector"));
+    ARROW_RETURN_IF(Rf_inherits(x, "integer64"),
+                    Status::RError("Expecting a vector that inherits integer64"));
 
-    // catch up
-    for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) {
-      null_bitmap_writer.Set();
+    auto* int_builder = checked_cast<Int64Builder*>(builder);
+
+    R_xlen_t n = XLENGTH(x);
+    const auto* data = (REAL(x));
+
+    RETURN_NOT_OK(int_builder->Reserve(n));
+    for (R_xlen_t i = 0; i < n; i++) {
+      const auto value = arrow::util::SafeCopy<int64_t>(data[i]);
+      if (value == NA_INT64) {
+        int_builder->UnsafeAppendNull();
+      } else {
+        int_builder->UnsafeAppend(value);
+      }
     }
 
-    // resume offset filling
-    for (; i < n; i++, ++p_offset, null_bitmap_writer.Next()) {
-      SEXP s = STRING_ELT(vec, i);
-      if (s == NA_STRING) {
-        null_bitmap_writer.Clear();
-        *p_offset = current_offset;
-        null_count++;
+    return Status::OK();
+  }
+
+  Status Visit(const arrow::DoubleType& type) {
+    ARROW_RETURN_IF(TYPEOF(x) != REALSXP, Status::RError("Expecting a numeric vector"));
+
+    auto* double_builder = checked_cast<DoubleBuilder*>(builder);
+
+    R_xlen_t n = XLENGTH(x);
+    const auto* data = (REAL(x));
+
+    RETURN_NOT_OK(double_builder->Reserve(n));
+    for (R_xlen_t i = 0; i < n; i++) {
+      const auto value = data[i];
+      if (ISNA(value)) {
+        double_builder->UnsafeAppendNull();
       } else {
-        null_bitmap_writer.Set();
-        *p_offset = current_offset += LENGTH(s);
+        double_builder->UnsafeAppend(value);
       }
     }
 
-    null_bitmap_writer.Finish();
+    return Status::OK();
   }
 
-  // ----- data buffer
-  if (current_offset > 0) {
-    STOP_IF_NOT_OK(AllocateBuffer(current_offset, &value_buffer));
-    p_offset = reinterpret_cast<int32_t*>(offset_buffer->mutable_data());
-    auto p_data = reinterpret_cast<char*>(value_buffer->mutable_data());
+  template <typename T>
+  arrow::enable_if_base_binary<T, Status> Visit(const T& type) {
+    using BuilderType = typename TypeTraits<T>::BuilderType;
+
+    ARROW_RETURN_IF(TYPEOF(x) != STRSXP, Status::RError("Expecting a character vector"));
 
+    auto* binary_builder = checked_cast<BuilderType*>(builder);
+
+    R_xlen_t n = XLENGTH(x);
+    RETURN_NOT_OK(builder->Reserve(n));
     for (R_xlen_t i = 0; i < n; i++) {
-      SEXP s = STRING_ELT(vec, i);
-      if (s != NA_STRING) {
-        auto ni = LENGTH(s);
-        std::copy_n(CHAR(s), ni, p_data);
-        p_data += ni;
+      SEXP s = STRING_ELT(x, i);
+      if (s == NA_STRING) {
+        RETURN_NOT_OK(binary_builder->AppendNull());
+        continue;
       }
+
+      RETURN_NOT_OK(binary_builder->Append(CHAR(s), LENGTH(s)));
     }
+
+    return Status::OK();
   }
 
-  auto data = ArrayData::Make(arrow::utf8(), n,
-                              {null_buffer, offset_buffer, value_buffer}, null_count, 0);
-  return MakeArray(data);
+  template <typename T>
+  arrow::enable_if_base_list<T, Status> Visit(const T& type) {
+    using BuilderType = typename TypeTraits<T>::BuilderType;
+
+    ARROW_RETURN_IF(TYPEOF(x) != VECSXP, Status::RError("Expecting a list vector"));
+
+    auto* list_builder = checked_cast<BuilderType*>(builder);
+    auto* value_builder = list_builder->value_builder();
+    auto value_type = type.value_type();
+
+    R_xlen_t n = XLENGTH(x);
+    RETURN_NOT_OK(builder->Reserve(n));
+    for (R_xlen_t i = 0; i < n; i++) {
+      SEXP vector = VECTOR_ELT(x, i);
+      if (vector == R_NilValue) {
+        list_builder->AppendNull();
+        continue;
+      }
+
+      list_builder->Append();
+
+      auto vect_type = arrow::r::InferArrowType(vector);
+      if (!value_type->Equals(vect_type)) {
+        return Status::RError("List vector expecting elements vector of type ",
+                              value_type->ToString(), " but got ", vect_type->ToString());
+      }
+
+      // Recurse.
+      VectorToArrayConverter converter{vector, value_builder};
+      RETURN_NOT_OK(arrow::VisitTypeInline(*value_type, &converter));
+    }
+
+    return Status::OK();
+  }
+
+  Status Visit(const arrow::DataType& type) {
+    return Status::NotImplemented("Converting vector to arrow type ", type.ToString(),
+                                  " not implemented");
+  }
+
+  static std::shared_ptr<Array> Visit(SEXP x, const std::shared_ptr<DataType>& type) {
+    std::unique_ptr<ArrayBuilder> builder;
+    STOP_IF_NOT_OK(MakeBuilder(arrow::default_memory_pool(), type, &builder));
+
+    VectorToArrayConverter converter{x, builder.get()};
+    STOP_IF_NOT_OK(arrow::VisitTypeInline(*type, &converter));
+
+    std::shared_ptr<Array> result;
+    STOP_IF_NOT_OK(builder->Finish(&result));
+    return result;
+  }
+
+  SEXP x;
+  arrow::ArrayBuilder* builder;
+};
+
+std::shared_ptr<Array> MakeStringArray(SEXP x, const std::shared_ptr<DataType>& type) {
+  return VectorToArrayConverter::Visit(x, type);
 }
 
 template <typename Type>
@@ -163,7 +272,7 @@ std::shared_ptr<Array> MakeFactorArrayImpl(Rcpp::IntegerVector_ factor,
   auto array_indices = MakeArray(array_indices_data);
 
   SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
-  auto dict = MakeStringArray(levels);
+  auto dict = MakeStringArray(levels, utf8());
 
   std::shared_ptr<Array> out;
   STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, dict, &out));
@@ -192,6 +301,10 @@ std::shared_ptr<Array> MakeStructArray(SEXP df, const std::shared_ptr<DataType>&
   return std::make_shared<StructArray>(type, children[0]->length(), children);
 }
 
+std::shared_ptr<Array> MakeListArray(SEXP x, const std::shared_ptr<DataType>& type) {
+  return VectorToArrayConverter::Visit(x, type);
+}
+
 template <typename T>
 int64_t time_cast(T value);
 
@@ -211,7 +324,6 @@ inline int64_t time_cast<double>(double value) {
 // ---------------- new api
 
 namespace arrow {
-using internal::checked_cast;
 
 namespace internal {
 
@@ -797,84 +909,118 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
   return Status::NotImplemented("type not implemented");
 }
 
-template <typename Type>
-std::shared_ptr<arrow::DataType> GetFactorTypeImpl(bool ordered) {
-  return dictionary(std::make_shared<Type>(), arrow::utf8(), ordered);
+static inline std::shared_ptr<arrow::DataType> IndexTypeForFactors(int n_factors) {
+  if (n_factors < INT8_MAX) {
+    return arrow::int8();
+  } else if (n_factors < INT16_MAX) {
+    return arrow::int16();
+  } else {
+    return arrow::int32();
+  }
 }
 
-std::shared_ptr<arrow::DataType> GetFactorType(SEXP factor) {
-  SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
+std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP factor) {
+  SEXP factors = Rf_getAttrib(factor, R_LevelsSymbol);
+  auto index_type = IndexTypeForFactors(Rf_length(factors));
   bool is_ordered = Rf_inherits(factor, "ordered");
-  int n = Rf_length(levels);
-  if (n < 128) {
-    return GetFactorTypeImpl<arrow::Int8Type>(is_ordered);
-  } else if (n < 32768) {
-    return GetFactorTypeImpl<arrow::Int16Type>(is_ordered);
+  return dictionary(index_type, arrow::utf8(), is_ordered);
+}
+
+template <int VectorType>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector(SEXP x) {
+  Rcpp::stop("Unknown vector type: ", VectorType);
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<ENVSXP>(SEXP x) {
+  if (Rf_inherits(x, "Array")) {
+    Rcpp::ConstReferenceSmartPtrInputParameter<std::shared_ptr<arrow::Array>> array(x);
+    return static_cast<std::shared_ptr<arrow::Array>>(array)->type();
+  }
+
+  Rcpp::stop("Unrecognized vector instance for type ENVSXP");
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<LGLSXP>(SEXP x) {
+  return Rf_inherits(x, "vctrs_unspecified") ? null() : boolean();
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<INTSXP>(SEXP x) {
+  if (Rf_isFactor(x)) {
+    return InferArrowTypeFromFactor(x);
+  } else if (Rf_inherits(x, "Date")) {
+    return date32();
+  } else if (Rf_inherits(x, "POSIXct")) {
+    return timestamp(TimeUnit::MICRO, "GMT");
+  }
+  return int32();
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<REALSXP>(SEXP x) {
+  if (Rf_inherits(x, "Date")) {
+    return date32();
+  }
+  if (Rf_inherits(x, "POSIXct")) {
+    return timestamp(TimeUnit::MICRO, "GMT");
+  }
+  if (Rf_inherits(x, "integer64")) {
+    return int64();
+  }
+  if (Rf_inherits(x, "difftime")) {
+    return time32(TimeUnit::SECOND);
+  }
+  return float64();
+}
+
+static inline std::shared_ptr<arrow::DataType> InferArrowTypeFromDataFrame(SEXP x) {
+  R_xlen_t n = XLENGTH(x);
+  SEXP names = Rf_getAttrib(x, R_NamesSymbol);
+  std::vector<std::shared_ptr<arrow::Field>> fields(n);
+  for (R_xlen_t i = 0; i < n; i++) {
+    const auto* field_name = CHAR(STRING_ELT(names, i));
+    fields[i] = arrow::field(field_name, InferArrowType(VECTOR_ELT(x, i)));
+  }
+  return arrow::struct_(std::move(fields));
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<VECSXP>(SEXP x) {
+  if (Rf_inherits(x, "data.frame")) {
+    return InferArrowTypeFromDataFrame(x);
   } else {
-    return GetFactorTypeImpl<arrow::Int32Type>(is_ordered);
+    if (XLENGTH(x) == 0) {
+      Rcpp::stop(
+          "Requires at least one element to infer the values' type of a list vector");
+    }
+
+    return arrow::list(InferArrowType(VECTOR_ELT(x, 0)));
   }
 }
 
-std::shared_ptr<arrow::DataType> InferType(SEXP x) {
+std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
   switch (TYPEOF(x)) {
     case ENVSXP:
-      if (Rf_inherits(x, "Array")) {
-        Rcpp::ConstReferenceSmartPtrInputParameter<std::shared_ptr<arrow::Array>> array(
-            x);
-        return static_cast<std::shared_ptr<arrow::Array>>(array)->type();
-      }
-      break;
+      return InferArrowTypeFromVector<ENVSXP>(x);
     case LGLSXP:
-      if (Rf_inherits(x, "vctrs_unspecified")) {
-        return null();
-      }
-      return boolean();
+      return InferArrowTypeFromVector<LGLSXP>(x);
     case INTSXP:
-      if (Rf_isFactor(x)) {
-        return GetFactorType(x);
-      }
-      if (Rf_inherits(x, "Date")) {
-        return date32();
-      }
-      if (Rf_inherits(x, "POSIXct")) {
-        return timestamp(TimeUnit::MICRO, "GMT");
-      }
-      return int32();
+      return InferArrowTypeFromVector<INTSXP>(x);
     case REALSXP:
-      if (Rf_inherits(x, "Date")) {
-        return date32();
-      }
-      if (Rf_inherits(x, "POSIXct")) {
-        return timestamp(TimeUnit::MICRO, "GMT");
-      }
-      if (Rf_inherits(x, "integer64")) {
-        return int64();
-      }
-      if (Rf_inherits(x, "difftime")) {
-        return time32(TimeUnit::SECOND);
-      }
-      return float64();
+      return InferArrowTypeFromVector<REALSXP>(x);
     case RAWSXP:
       return int8();
     case STRSXP:
       return utf8();
     case VECSXP:
-      if (Rf_inherits(x, "data.frame")) {
-        R_xlen_t n = XLENGTH(x);
-        SEXP names = Rf_getAttrib(x, R_NamesSymbol);
-        std::vector<std::shared_ptr<arrow::Field>> fields(n);
-        for (R_xlen_t i = 0; i < n; i++) {
-          fields[i] = std::make_shared<arrow::Field>(CHAR(STRING_ELT(names, i)),
-                                                     InferType(VECTOR_ELT(x, i)));
-        }
-        return std::make_shared<StructType>(std::move(fields));
-      }
-      break;
+      return InferArrowTypeFromVector<VECSXP>(x);
     default:
       break;
   }
 
-  Rcpp::stop("cannot infer type from data");
+  Rcpp::stop("Cannot infer type from vector");
 }
 
 // in some situations we can just use the memory of the R object in an RBuffer
@@ -945,29 +1091,28 @@ std::shared_ptr<Array> MakeSimpleArray(SEXP x) {
 }
 
 std::shared_ptr<arrow::Array> Array__from_vector_reuse_memory(SEXP x) {
-  switch (TYPEOF(x)) {
-    case INTSXP:
-      return MakeSimpleArray<INTSXP, Int32Type>(x);
-    case REALSXP:
-      if (Rf_inherits(x, "integer64")) {
-        return MakeSimpleArray<REALSXP, Int64Type>(x);
-      }
-      return MakeSimpleArray<REALSXP, DoubleType>(x);
-    case RAWSXP:
-      return MakeSimpleArray<RAWSXP, UInt8Type>(x);
-    default:
-      break;
+  auto type = TYPEOF(x);
+
+  if (type == INTSXP) {
+    return MakeSimpleArray<INTSXP, Int32Type>(x);
+  } else if (type == REALSXP && Rf_inherits(x, "integer64")) {
+    return MakeSimpleArray<REALSXP, Int64Type>(x);
+  } else if (type == REALSXP) {
+    return MakeSimpleArray<REALSXP, DoubleType>(x);
+  } else if (type == RAWSXP) {
+    return MakeSimpleArray<RAWSXP, UInt8Type>(x);
   }
 
-  Rcpp::stop("not implemented");
+  Rcpp::stop("Unreachable: you might need to fix can_reuse_memory()");
 }
 
 bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr<arrow::DataType>& type) {
-  if (!Rf_inherits(obj, "factor")) return false;
+  if (!Rf_inherits(obj, "factor")) {
+    return false;
+  }
 
-  arrow::DictionaryType* dict_type =
-      arrow::checked_cast<arrow::DictionaryType*>(type.get());
-  return dict_type->value_type() == utf8();
+  auto* dict_type = checked_cast<arrow::DictionaryType*>(type.get());
+  return dict_type->value_type()->Equals(utf8());
 }
 
 arrow::Status CheckCompatibleStruct(SEXP obj,
@@ -1016,8 +1161,7 @@ std::shared_ptr<arrow::Array> Array__from_vector(
 
   // treat strings separately for now
   if (type->id() == Type::STRING) {
-    STOP_IF_NOT(TYPEOF(x) == STRSXP, "Cannot convert R object to string array");
-    return arrow::r::MakeStringArray(x);
+    return arrow::r::MakeStringArray(x, type);
   }
 
   // factors only when type has been inferred
@@ -1029,6 +1173,10 @@ std::shared_ptr<arrow::Array> Array__from_vector(
     Rcpp::stop("Object incompatible with dictionary type");
   }
 
+  if (type->id() == Type::LIST) {
+    return arrow::r::MakeListArray(x, type);
+  }
+
   // struct types
   if (type->id() == Type::STRUCT) {
     if (!type_inferred) {
@@ -1060,7 +1208,7 @@ std::shared_ptr<arrow::Array> Array__from_vector(
 
 // [[arrow::export]]
 std::shared_ptr<arrow::DataType> Array__infer_type(SEXP x) {
-  return arrow::r::InferType(x);
+  return arrow::r::InferArrowType(x);
 }
 
 // [[arrow::export]]
@@ -1070,7 +1218,7 @@ std::shared_ptr<arrow::Array> Array__from_vector(SEXP x, SEXP s_type) {
   bool type_inferred = Rf_isNull(s_type);
   std::shared_ptr<arrow::DataType> type;
   if (type_inferred) {
-    type = arrow::r::InferType(x);
+    type = arrow::r::InferArrowType(x);
   } else {
     type = arrow::r::extract<arrow::DataType>(s_type);
   }
@@ -1093,7 +1241,7 @@ std::shared_ptr<arrow::ChunkedArray> ChunkedArray__from_list(Rcpp::List chunks,
     if (n == 0) {
       Rcpp::stop("type must be specified for empty list");
     }
-    type = arrow::r::InferType(VECTOR_ELT(chunks, 0));
+    type = arrow::r::InferArrowType(VECTOR_ELT(chunks, 0));
   } else {
     type = arrow::r::extract<arrow::DataType>(s_type);
   }
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 86f74c9..5facffa 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -210,7 +210,10 @@ inline std::shared_ptr<T> extract(SEXP x) {
 #include <arrow/json/reader.h>
 #include <arrow/result.h>
 #include <arrow/type.h>
+#include <arrow/util/checked_cast.h>
 #include <arrow/util/compression.h>
+#include <arrow/util/ubsan.h>
+#include <arrow/visitor_inline.h>
 #include <parquet/arrow/reader.h>
 #include <parquet/arrow/writer.h>
 #include <parquet/exception.h>
@@ -284,7 +287,7 @@ class RBuffer : public MutableBuffer {
   Vec vec_;
 };
 
-std::shared_ptr<arrow::DataType> GetFactorType(SEXP);
+std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP);
 
 }  // namespace r
 }  // namespace arrow
diff --git a/r/src/expression.cpp b/r/src/expression.cpp
index 2615e21..d63aab8 100644
--- a/r/src/expression.cpp
+++ b/r/src/expression.cpp
@@ -142,7 +142,7 @@ std::shared_ptr<ds::ScalarExpression> dataset___expr__scalar(SEXP x) {
     case INTSXP:
       if (Rf_inherits(x, "factor")) {
         // TODO: This does not use the actual value, just the levels
-        auto type = arrow::r::GetFactorType(x);
+        auto type = arrow::r::InferArrowTypeFromFactor(x);
         return ds::scalar(std::make_shared<arrow::DictionaryScalar>(type));
       }
       return ds::scalar(Rf_asInteger(x));
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index e379186..c8252d6 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -456,7 +456,68 @@ test_that("Array$create() can handle data frame with custom struct type (not inf
   expect_error(Array$create(df, type = type), regexp = "Field name in position.*does not match the name of the column of the data frame")
 
   type <- struct(x = float64(), y = utf8())
-  expect_error(Array$create(df, type = type), regexp = "Cannot convert R object to string array")
+  expect_error(Array$create(df, type = type), regexp = "Expecting a character vector")
+})
+
+test_that("Array$create() handles vector -> list arrays (ARROW-7662)", {
+  expect_list_array <- function(v, type) {
+    a <- Array$create(v)
+    expect_equal(a$type, list_of(type))
+    expect_equivalent(a$as_vector(), v)
+  }
+
+  # Should be able to create an empty list with a type hint.
+  Array$create(list(), list_of(bool()))
+
+  # logical
+  expect_list_array(list(NA), bool())
+  expect_list_array(list(logical(0)), bool())
+  expect_list_array(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), bool())
+  expect_list_array(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), bool())
+
+  # integer
+  expect_list_array(list(NA_integer_), int32())
+  expect_list_array(list(integer(0)), int32())
+  expect_list_array(list(1:2, 3:4, 12:18), int32())
+  expect_list_array(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), int32())
+
+  # numeric
+  expect_list_array(list(NA_real_), float64())
+  expect_list_array(list(numeric(0)), float64())
+  expect_list_array(list(1, c(2, 3), 4), float64())
+  expect_list_array(list(1, numeric(0), c(2, 3, NA_real_), 4), float64())
+
+  # character
+  expect_list_array(list(NA_character_), utf8())
+  expect_list_array(list(character(0)), utf8())
+  expect_list_array(list("itsy", c("bitsy", "spider"), c("is")), utf8())
+  expect_list_array(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), utf8())
+})
+
+test_that("Array$create() should have helpful error on lists with type hint", {
+  expect_error(Array$create(list(numeric(0)), list_of(bool())),
+               regexp = "List vector expecting elements vector of type")
+  expect_error(Array$create(list(numeric(0)), list_of(int32())),
+               regexp = "List vector expecting elements vector of type")
+  expect_error(Array$create(list(integer(0)), list_of(float64())),
+               regexp = "List vector expecting elements vector of type")
+})
+
+test_that("Array$create() should refuse heterogeneous lists", {
+  lgl <- logical(0)
+  int <- integer(0)
+  num <- numeric(0)
+  char <- character(0)
+
+  expect_error(Array$create(list()),
+               regexp = "Requires at least one element to infer the values'")
+
+  expect_error(Array$create(list(lgl, lgl, int)),
+               regexp = "List vector expecting elements vector of type")
+  expect_error(Array$create(list(char, num, char)),
+               regexp = "List vector expecting elements vector of type")
+  expect_error(Array$create(list(int, int, num)),
+               regexp = "List vector expecting elements vector of type")
 })
 
 test_that("Array$View() (ARROW-6542)", {
diff --git a/r/tests/testthat/test-array-data.R b/r/tests/testthat/test-array-data.R
index 7890482..881706c 100644
--- a/r/tests/testthat/test-array-data.R
+++ b/r/tests/testthat/test-array-data.R
@@ -22,7 +22,14 @@ test_that("string vectors with only empty strings and nulls don't allocate a dat
   expect_equal(a$length(), 1L)
 
   buffers <- a$data()$buffers
-  expect_null(buffers[[1]])
-  expect_null(buffers[[3]])
+
+  # No nulls
+  expect_equal(buffers[[1]]$size, 1)
+
+  # Offsets has 2 elements
   expect_equal(buffers[[2]]$size, 8L)
+
+  # As per ARROW-2744, values buffer should preferably be non-null.
+  expect_equal(buffers[[3]]$size, 0L)
+  expect_equal(buffers[[3]]$capacity, 0L)
 })
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index d05602d..d46eb5e 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -121,6 +121,21 @@ test_that("Factors are preserved when writing/reading from Parquet", {
   expect_identical(df, df_read)
 })
 
+test_that("Lists are preserved when writing/reading from Parquet", {
+  bool <- list(logical(0), NA, c(TRUE, FALSE))
+  int <- list(integer(0), NA_integer_, 1:4)
+  num <- list(numeric(0), NA_real_, c(1, 2))
+  char <- list(character(0), NA_character_, c("itsy", "bitsy"))
+  df <- tibble::tibble(bool = bool, int = int, num = num, char = char)
+
+  pq_tmp_file <- tempfile()
+  on.exit(unlink(pq_tmp_file))
+
+  write_parquet(df, pq_tmp_file)
+  df_read <- read_parquet(pq_tmp_file)
+  expect_equivalent(df, df_read)
+})
+
 test_that("write_parquet() to stream", {
   df <- tibble::tibble(x = 1:5)
   tf <- tempfile()