You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/07/04 10:10:46 UTC

[GitHub] [spark] MichaelChirico commented on a change in pull request #24888: [SPARK-28040][SPARK-28070][R] Write type object s3

MichaelChirico commented on a change in pull request #24888: [SPARK-28040][SPARK-28070][R] Write type object s3
URL: https://github.com/apache/spark/pull/24888#discussion_r300330382
 
 

 ##########
 File path: R/pkg/R/serialize.R
 ##########
 @@ -36,192 +36,246 @@
 # nolint end
 
 getSerdeType <- function(object) {
-  type <- class(object)[[1]]
-  if (is.atomic(object) & !is.raw(object) & length(object) > 1) {
+  type <- class(object)[[1L]]
+  if (is.atomic(object) & !is.raw(object) & length(object) > 1L) {
     "array"
   } else if (type != "list") {
      type
   } else {
     # Check if all elements are of same type
     elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) }))
-    if (length(elemType) <= 1) {
+    if (length(elemType) <= 1L) {
       "array"
     } else {
       "list"
     }
   }
 }
 
-writeObject <- function(con, object, writeType = TRUE) {
-  # NOTE: In R vectors have same type as objects
-  type <- class(object)[[1]]  # class of POSIXlt is c("POSIXlt", "POSIXt")
-  # Checking types is needed here, since 'is.na' only handles atomic vectors,
-  # lists and pairlists
-  if (type %in% c("integer", "character", "logical", "double", "numeric")) {
-    if (is.na(object)) {
-      object <- NULL
-      type <- "NULL"
-    }
-  }
+# NOTE: In R vectors have same type as objects
+writeObject <- function(object, con, writeType = TRUE) UseMethod("writeObject")
+writeObject.default <- function(object, con, writeType = TRUE) {
+  stop(paste("Unsupported type for serialization", class(object)))
+}
 
-  serdeType <- getSerdeType(object)
+# integer same as logical; will cast TRUE -> 1, FALSE -> 0
+writeObject.integer <-
+writeObject.logical <- function(object, con, writeType = TRUE) {
   if (writeType) {
-    writeType(con, serdeType)
+    writeType(object, con)
   }
-  switch(serdeType,
-         NULL = writeVoid(con),
-         integer = writeInt(con, object),
-         character = writeString(con, object),
-         logical = writeBoolean(con, object),
-         double = writeDouble(con, object),
-         numeric = writeDouble(con, object),
-         raw = writeRaw(con, object),
-         array = writeArray(con, object),
-         list = writeList(con, object),
-         struct = writeList(con, object),
-         jobj = writeJobj(con, object),
-         environment = writeEnv(con, object),
-         Date = writeDate(con, object),
-         POSIXlt = writeTime(con, object),
-         POSIXct = writeTime(con, object),
-         stop(paste("Unsupported type for serialization", type)))
-}
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeArray(object, con))
+  if (is.na(object)) return() # no value for NULL
 
-writeVoid <- function(con) {
-  # no value for NULL
+  writeBin(as.integer(object), con, endian = "big")
 }
-
-writeJobj <- function(con, value) {
-  if (!isValidJobj(value)) {
-    stop("invalid jobj ", value$id)
+writeObject.character <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
   }
-  writeString(con, value$id)
-}
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeArray(object, con))
+  if (is.na(object)) return() # no value for NULL
 
-writeString <- function(con, value) {
   utfVal <- enc2utf8(value)
-  writeInt(con, as.integer(nchar(utfVal, type = "bytes") + 1))
+  writeObject(as.integer(nchar(utfVal, type = "bytes") + 1L), writeType = FALSE)
   writeBin(utfVal, con, endian = "big", useBytes = TRUE)
 }
+writeObject.numeric <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
+  }
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeArray(object, con))
+  if (is.na(object)) return() # no value for NULL
 
-writeInt <- function(con, value) {
-  writeBin(as.integer(value), con, endian = "big")
+  writeBin(object, con, endian = "big")
 }
-
-writeDouble <- function(con, value) {
-  writeBin(value, con, endian = "big")
+writeObject.raw <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
+  }
+  writeObject(length(object), con, writeType = FALSE)
+  writeBin(object, con, endian = "big")
 }
-
-writeBoolean <- function(con, value) {
-  # TRUE becomes 1, FALSE becomes 0
-  writeInt(con, as.integer(value))
+writeObject.struct <-
+writeObject.list <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    # writeType.list either writes array or list; TRUE is returned for list
+    if (isTRUE(writeType(object, con))) {
+      writeObject(length(object), con, writeType = FALSE)
+      return(writeBin(object, con, endian = "big"))
+    } else {
+      return(writeArray(object, con))
+    }
+  }
+  # Check if all elements are of same type
+  elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) }))
+  if (length(elemType) <= 1L) {
+    return(writeArray(object, con))
+  }
+  writeObject(length(object), con, writeType = FALSE)
+  writeBin(object, con, endian = "big")
+}
+writeObject.jobj <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
+  }
+  if (!isValidJobj(object)) {
+    stop("invalid jobj ", object$id)
+  }
+  writeObject(object$id, writeType = FALSE)
+}
+# Used to pass in hash maps required on Java side.
+writeObject.environment <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
+  }
+  len <- length(object)
+  writeObject(len, con, writeType = FALSE)
+  if (len > 0L) {
+    envObj <- ls(object)
+    writeArray(as.list(envObj), con)
+    writeObject(mget(envObj, object), writeType = FALSE)
+  }
+  writeBin(object, con, endian = "big")
+}
+writeObject.Date <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
+  }
+  writeObject(as.character(object), con, writeType = FALSE)
+}
+# covers POSIXct and POSIXt
+writeObject.POSIXt <- function(object, con, writeType = TRUE) {
+  if (writeType) {
+    writeType(object, con)
+  }
+  writeObject(as.double(object), con, writeType = FALSE)
 }
 
-writeRawSerialize <- function(outputCon, batch) {
+writeRawSerialize <- function(batch, outputCon) {
   outputSer <- serialize(batch, ascii = FALSE, connection = NULL)
-  writeRaw(outputCon, outputSer)
+  writeObject(outputSer, outputCon, writeType = FALSE)
 }
 
-writeRowSerialize <- function(outputCon, rows) {
+writeRowSerialize <- function(rows, outputCon) {
   invisible(lapply(rows, function(r) {
     bytes <- serializeRow(r)
-    writeRaw(outputCon, bytes)
+    writeObject(bytes, outputCon, writeType = FALSE)
   }))
 }
 
 serializeRow <- function(row) {
-  rawObj <- rawConnection(raw(0), "wb")
+  rawObj <- rawConnection(raw(0L), "wb")
   on.exit(close(rawObj))
-  writeList(rawObj, row)
+  writeObject(as.list(row), rawObj, writeType = FALSE)
   rawConnectionValue(rawObj)
 }
 
-writeRaw <- function(con, batch) {
-  writeInt(con, length(batch))
-  writeBin(batch, con, endian = "big")
-}
-
-writeType <- function(con, class) {
-  type <- switch(class,
-                 NULL = "n",
-                 integer = "i",
-                 character = "c",
-                 logical = "b",
-                 double = "d",
-                 numeric = "d",
-                 raw = "r",
-                 array = "a",
-                 list = "l",
-                 struct = "s",
-                 jobj = "j",
-                 environment = "e",
-                 Date = "D",
-                 POSIXlt = "t",
-                 POSIXct = "t",
-                 stop(paste("Unsupported type for serialization", class)))
-  writeBin(charToRaw(type), con)
-}
 
-# Used to pass arrays where all the elements are of the same type
-writeArray <- function(con, arr) {
-  # TODO: Empty lists are given type "character" right now.
-  # This may not work if the Java side expects array of any other type.
-  if (length(arr) == 0) {
-    elemType <- class("somestring")
-  } else {
-    elemType <- getSerdeType(arr[[1]])
-  }
+writeType <- function(object, con) UseMethod("writeType")
+writeType.default <- function(object, con) {
+  stop("Unsupported type for serialization", class(object))
+}
 
-  writeType(con, elemType)
-  writeInt(con, length(arr))
+# markers are written into con to signal incoming object
+#   type according to the following mapping:
+#        type marker  raw
+#        Date      D 0x44
+#       array      a 0x61
+#     logical      b 0x62
+#   character      c 0x63
+#     numeric      d 0x64
+# environment      e 0x65
+#     integer      i 0x69
+#        jobj      j 0x6a
+#        list      l 0x6c
+#        null      n 0x6e
+#         raw      r 0x72
+#      struct      s 0x73
+#      POSIXt      t 0x74
 
-  if (length(arr) > 0) {
-    for (a in arr) {
-      writeObject(con, a, FALSE)
-    }
-  }
+# 'is.na' only handles atomic vectors, lists and pairlists;
+#   all atomic classes except complex are handled; complex will error
+writeType.integer <- function(object, con) {
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeBin(as.raw(0x61), con))
+  if (is.na(object)) return(writeBin(as.raw(0x6e), con))
+  writeBin(as.raw(0x69), con)
 }
-
-# Used to pass arrays where the elements can be of different types
-writeList <- function(con, list) {
-  writeInt(con, length(list))
-  for (elem in list) {
-    writeObject(con, elem)
-  }
+writeType.character <- function(object, con) {
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeBin(as.raw(0x61), con))
+  if (is.na(object)) return(writeBin(as.raw(0x6e), con))
+  writeBin(as.raw(0x63), con)
 }
-
-# Used to pass in hash maps required on Java side.
-writeEnv <- function(con, env) {
-  len <- length(env)
-
-  writeInt(con, len)
-  if (len > 0) {
-    writeArray(con, as.list(ls(env)))
-    vals <- lapply(ls(env), function(x) { env[[x]] })
-    writeList(con, as.list(vals))
+writeType.logical <- function(object, con) {
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeBin(as.raw(0x61), con))
+  if (is.na(object)) return(writeBin(as.raw(0x6e), con))
+  writeBin(as.raw(0x62), con)
+}
+writeType.numeric <- function(object, con) {
+  # non-scalar value written as array
+  if (length(object) > 1L) return(writeBin(as.raw(0x61), con))
+  if (is.na(object)) return(writeBin(as.raw(0x6e), con))
+  writeBin(as.raw(0x64), con)
+}
+writeType.raw <- function(object, con) {
+  writeBin(as.raw(0x72), con)
+}
+writeType.list <- function(object, con) {
+  # Check if all elements are of same type
+  elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) }))
+  if (length(elemType) <= 1L) {
+    return(writeBin(as.raw(0x61), con))
   }
+  writeBin(as.raw(0x6c), con)
+  # emit TRUE to signal that this object is being treated as a list
+  return(TRUE)
 }
-
-writeDate <- function(con, date) {
-  writeString(con, as.character(date))
+writeType.struct <- function(object, con) {
+  writeBin(as.raw(0x73), con)
+}
+writeType.jobj <- function(object, con) {
+  writeBin(as.raw(0x6a), con)
+}
+writeType.environment <- function(object, con) {
+  writeBin(as.raw(0x65), con)
+}
+writeType.Date <- function(object, con) {
+  writeBin(as.raw(0x44), con)
+}
+# covers POSIXct and POSIXt
+writeType.POSIXt <- function(object, con) {
+  writeBin(as.raw(0x74), con)
 }
 
-writeTime <- function(con, time) {
-  writeDouble(con, as.double(time))
+# Used to pass arrays where all the elements are of the same type
+writeArray <- function(arr, con) {
+  # TODO: Empty lists are given type "character" right now.
+  # This may not work if the Java side expects array of any other type.
+  writeType(if (length(arr) > 0L) arr[[1L]] else "somestring", con)
+  writeObject(length(arr), con, writeType = FALSE)
 
 Review comment:
   That's funny that there's a typo there. Didn't show up in either the `r-source` or my spark source.... investigating anyway

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org