You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/04/01 12:01:08 UTC

[GitHub] [arrow] alamb commented on a change in pull request #9840: ARROW-12107: [Rust][DataFusion] Support `SELECT * from information_schema.columns`

alamb commented on a change in pull request #9840:
URL: https://github.com/apache/arrow/pull/9840#discussion_r605598128



##########
File path: rust/datafusion/src/catalog/information_schema.rs
##########
@@ -221,3 +262,226 @@ impl InformationSchemaTablesBuilder {
         MemTable::try_new(schema, vec![vec![batch]]).unwrap()
     }
 }
+
+/// Builds the `information_schema.COLUMNS` table row by row
+///
+/// Columns are based on https://www.postgresql.org/docs/current/infoschema-columns.html
+///
+/// Each field is an Arrow array builder holding the values of one output
+/// column; `add_column` appends exactly one entry to every builder so
+/// they all stay the same length.
+struct InformationSchemaColumnsBuilder {
+    /// Catalog name passed to `add_column` for each row
+    catalog_names: StringBuilder,
+    /// Schema name passed to `add_column` for each row
+    schema_names: StringBuilder,
+    /// Table name passed to `add_column` for each row
+    table_names: StringBuilder,
+    /// Name of the column being described
+    column_names: StringBuilder,
+    /// Column position, stored as given by the caller (cast to `u64`)
+    ordinal_positions: UInt64Builder,
+    /// Column default expression; `add_column` always appends null here
+    column_defaults: StringBuilder,
+    /// `"YES"` or `"NO"` depending on the column's nullability flag
+    is_nullables: StringBuilder,
+    /// Debug (`{:?}`) rendering of the column's Arrow `DataType`
+    data_types: StringBuilder,
+    /// Declared maximum character length; `add_column` always appends null
+    character_maximum_lengths: UInt64Builder,
+    /// Maximum length in bytes for string / binary types, null otherwise
+    character_octet_lengths: UInt64Builder,
+    /// Precision for numeric types, null otherwise
+    numeric_precisions: UInt64Builder,
+    /// Base (2 or 10) in which precision and scale are expressed
+    numeric_precision_radixes: UInt64Builder,
+    /// Scale for decimal types, null otherwise
+    numeric_scales: UInt64Builder,
+    /// Datetime precision; `add_column` always appends null here
+    datetime_precisions: UInt64Builder,
+    /// Interval type; `add_column` always appends null here
+    interval_types: StringBuilder,
+}
+
+impl InformationSchemaColumnsBuilder {
+    /// Creates a builder with no rows.
+    fn new() -> Self {
+        // The Arrow builders must be given a starting capacity. The
+        // number of rows is not known at this point and this code is
+        // not performance critical, so a small arbitrary value is used.
+        const INITIAL_CAPACITY: usize = 10;
+        let strings = || StringBuilder::new(INITIAL_CAPACITY);
+        let uints = || UInt64Builder::new(INITIAL_CAPACITY);
+
+        Self {
+            catalog_names: strings(),
+            schema_names: strings(),
+            table_names: strings(),
+            column_names: strings(),
+            ordinal_positions: uints(),
+            column_defaults: strings(),
+            is_nullables: strings(),
+            data_types: strings(),
+            character_maximum_lengths: uints(),
+            character_octet_lengths: uints(),
+            numeric_precisions: uints(),
+            numeric_precision_radixes: uints(),
+            numeric_scales: uints(),
+            datetime_precisions: uints(),
+            interval_types: strings(),
+        }
+    }
+
+    /// Appends one row describing a single column of a table.
+    ///
+    /// * `catalog_name`, `schema_name`, `table_name` - location of the
+    ///   table that owns the column
+    /// * `column_name` - name of the column
+    /// * `column_position` - stored in `ordinal_position` as given
+    /// * `is_nullable` - reported as `"YES"` or `"NO"`
+    /// * `data_type` - Arrow type used to derive the type name, octet
+    ///   length and numeric precision / radix / scale
+    ///
+    /// `column_default`, `character_maximum_length`,
+    /// `datetime_precision` and `interval_type` are always appended as
+    /// null. Exactly one entry is appended to every builder so that all
+    /// output columns keep the same length.
+    fn add_column(
+        &mut self,
+        catalog_name: impl AsRef<str>,
+        schema_name: impl AsRef<str>,
+        table_name: impl AsRef<str>,
+        column_name: impl AsRef<str>,
+        column_position: usize,
+        is_nullable: bool,
+        data_type: &DataType,
+    ) {
+        use DataType::*;
+
+        // Note: append_value is actually infallible.
+        self.catalog_names
+            .append_value(catalog_name.as_ref())
+            .unwrap();
+        self.schema_names
+            .append_value(schema_name.as_ref())
+            .unwrap();
+        self.table_names.append_value(table_name.as_ref()).unwrap();
+
+        self.column_names
+            .append_value(column_name.as_ref())
+            .unwrap();
+
+        self.ordinal_positions
+            .append_value(column_position as u64)
+            .unwrap();
+
+        // DataFusion does not support column default values, so null
+        self.column_defaults.append_null().unwrap();
+
+        // "YES if the column is possibly nullable, NO if it is known not nullable. "
+        let nullable_str = if is_nullable { "YES" } else { "NO" };
+        self.is_nullables.append_value(nullable_str).unwrap();
+
+        // "System supplied type" --> Use debug format of the datatype
+        self.data_types
+            .append_value(format!("{:?}", data_type))
+            .unwrap();
+
+        // "If data_type identifies a character or bit string type, the
+        // declared maximum length; null for all other data types or
+        // if no maximum length was declared."
+        //
+        // Arrow has no equivalent of VARCHAR(20), so we leave this as Null
+        let max_chars: Option<u64> = None;
+        self.character_maximum_lengths
+            .append_option(max_chars)
+            .unwrap();
+
+        // "Maximum length, in bytes, for binary data, character data,
+        // or text and image data."
+        //
+        // The limits follow the offset width of the Arrow type: 32-bit
+        // offsets for Utf8 / Binary, 64-bit for the Large variants.
+        let char_len: Option<u64> = match data_type {
+            Utf8 | Binary => Some(i32::MAX as u64),
+            LargeBinary | LargeUtf8 => Some(i64::MAX as u64),
+            _ => None,
+        };
+        self.character_octet_lengths
+            .append_option(char_len)
+            .unwrap();
+
+        // numeric_precision: "If data_type identifies a numeric type, this column
+        // contains the (declared or implicit) precision of the type
+        // for this column. The precision indicates the number of
+        // significant digits. It can be expressed in decimal (base
+        // 10) or binary (base 2) terms, as specified in the column
+        // numeric_precision_radix. For all other data types, this
+        // column is null."
+        //
+        // numeric_radix: If data_type identifies a numeric type, this
+        // column indicates in which base the values in the columns
+        // numeric_precision and numeric_scale are expressed. The
+        // value is either 2 or 10. For all other data types, this
+        // column is null.
+        //
+        // numeric_scale: If data_type identifies an exact numeric
+        // type, this column contains the (declared or implicit) scale
+        // of the type for this column. The scale indicates the number
+        // of significant digits to the right of the decimal point. It
+        // can be expressed in decimal (base 10) or binary (base 2)
+        // terms, as specified in the column
+        // numeric_precision_radix. For all other data types, this
+        // column is null.
+        let (numeric_precision, numeric_radix, numeric_scale) = match data_type {
+            Int8 | UInt8 => (Some(8), Some(2), None),
+            Int16 | UInt16 => (Some(16), Some(2), None),
+            Int32 | UInt32 => (Some(32), Some(2), None),
+            // 64-bit integers are numeric too and follow the same
+            // "bit width, radix 2" convention as the narrower integers
+            Int64 | UInt64 => (Some(64), Some(2), None),
+            // From max value of 65504 as explained on
+            // https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Exponent_encoding
+            Float16 => (Some(15), Some(2), None),
+            // Numbers from postgres `real` type
+            Float32 => (Some(24), Some(2), None),
+            // Numbers from postgres `double precision` type, which has
+            // a 53 bit significand (24 is the precision of `real`)
+            Float64 => (Some(53), Some(2), None),
+            Decimal(precision, scale) => {
+                (Some(*precision as u64), Some(10), Some(*scale as u64))
+            }
+            _ => (None, None, None),
+        };
+
+        self.numeric_precisions
+            .append_option(numeric_precision)
+            .unwrap();
+        self.numeric_precision_radixes
+            .append_option(numeric_radix)
+            .unwrap();
+        self.numeric_scales.append_option(numeric_scale).unwrap();
+
+        // Datetime precision and interval type are not reported, so null
+        self.datetime_precisions.append_option(None).unwrap();
+        self.interval_types.append_null().unwrap();
+    }
+
+    fn build(self) -> MemTable {

Review comment:
       (which I now realize I put into https://github.com/apache/arrow/pull/9866 rather than this PR 🤦 ) -- but it should get in that PR hopefully




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org