You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/26 12:55:03 UTC

[GitHub] [arrow] manojkarthick commented on a change in pull request #9306: ARROW-10297: [Rust] Parameter for parquet-read to output data in json format

manojkarthick commented on a change in pull request #9306:
URL: https://github.com/apache/arrow/pull/9306#discussion_r564488177



##########
File path: rust/parquet/src/record/api.rs
##########
@@ -635,6 +648,55 @@ impl Field {
             _ => nyi!(descr, value),
         }
     }
+
+    #[cfg(feature = "json_output")]
+    pub fn to_json_value(&self) -> Value {
+        match &self {
+            Field::Null => Value::Null,
+            Field::Bool(b) => Value::Bool(*b),
+            Field::Byte(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Short(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Int(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Long(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UByte(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UShort(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UInt(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::ULong(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Float(n) => serde_json::Number::from_f64(*n as f64)
+                .map(Value::Number)
+                .unwrap_or(Value::Null),
+            Field::Double(n) => serde_json::Number::from_f64(*n)
+                .map(Value::Number)
+                .unwrap_or(Value::Null),
+            Field::Decimal(n) => Value::String(convert_decimal_to_string(&n)),
+            Field::Str(s) => Value::String(s.to_owned()),
+            Field::Bytes(b) => Value::String(base64::encode(b.data())),
+            Field::Date(d) => Value::String(convert_date_to_string(*d)),
+            Field::TimestampMillis(ts) => {
+                Value::String(convert_timestamp_millis_to_string(*ts))
+            }
+            Field::TimestampMicros(ts) => {
+                Value::String(convert_timestamp_micros_to_string(*ts))
+            }
+            Field::Group(row) => row.to_json_value(),
+            Field::ListInternal(fields) => {
+                Value::Array(fields.elements.iter().map(|f| f.to_json_value()).collect())
+            }
+            Field::MapInternal(map) => Value::Object(
+                map.entries
+                    .iter()
+                    .map(|(key_field, value_field)| {
+                        let key_val = key_field.to_json_value();
+                        let key_str = key_val
+                            .as_str()
+                            .map(|s| s.to_owned())
+                            .unwrap_or_else(|| key_val.to_string());
+                        (key_str, value_field.to_json_value())
+                    })
+                    .collect(),
+            ),
+        }
+    }
 }

Review comment:
       Thank you - I will work on that.

##########
File path: rust/parquet/src/bin/parquet-read.rs
##########
@@ -34,39 +34,98 @@
 //! ```
 //!
 //! # Usage
-//!
 //! ```
-//! parquet-read <file-path> [num-records]
+//!  parquet-read <file-path> [num-records]
 //! ```
-//! where `file-path` is the path to a Parquet file and `num-records` is the optional
-//! numeric option that allows to specify number of records to read from a file.
-//! When not provided, all records are read.
+//!
+//! ## Flags
+//!     -h, --help       Prints help information
+//!     -j, --json       Print parquet file in JSON lines Format
+//!     -V, --version    Prints version information
+//!
+//! ## Args
+//!     <file-path>      Path to a parquet file
+//!     <num-records>    Number of records to read. When not provided, all records are read.
 //!
 //! Note that `parquet-read` reads full file schema, no projection or filtering is
 //! applied.
 
 extern crate parquet;
 
-use std::{env, fs::File, path::Path, process};
+use std::{env, fs::File, path::Path};
+
+use clap::{crate_authors, crate_version, App, Arg};
 
 use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::Row;
 
 fn main() {
-    let args: Vec<String> = env::args().collect();
-    if args.len() != 2 && args.len() != 3 {
-        println!("Usage: parquet-read <file-path> [num-records]");
-        process::exit(1);
-    }
+    #[cfg(not(feature = "json_output"))]
+    let app = App::new("parquet-read")
+        .version(crate_version!())
+        .author(crate_authors!())
+        .about("Read data from parquet file")
+        .arg(
+            Arg::with_name("file_path")
+                .value_name("file-path")
+                .required(true)
+                .index(1)
+                .help("Path to a parquet file"),
+        )
+        .arg(
+            Arg::with_name("num_records")
+                .value_name("num-records")
+                .index(2)
+                .help(
+                    "Number of records to read. When not provided, all records are read.",
+                ),
+        );
+
+    #[cfg(feature = "json_output")]
+    let app = App::new("parquet-read")
+        .version(crate_version!())
+        .author(crate_authors!())
+        .about("Read data from parquet file")
+        .arg(
+            Arg::with_name("file_path")
+                .value_name("file-path")
+                .required(true)
+                .index(1)
+                .help("Path to a parquet file"),
+        )
+        .arg(
+            Arg::with_name("num_records")
+                .value_name("num-records")
+                .index(2)
+                .help(
+                    "Number of records to read. When not provided, all records are read.",
+                ),
+        )
+        .arg(

Review comment:
       I tried but that didn't work since `clap::App` doesn't implement the `Copy` trait. OTOH, if we move this into a CLI tools feature, I don't think we need to have a separate feature for json output. Thoughts?

##########
File path: rust/parquet/Cargo.toml
##########
@@ -43,6 +43,8 @@ chrono = "0.4"
 num-bigint = "0.3"
 arrow = { path = "../arrow", version = "4.0.0-SNAPSHOT", optional = true }
 base64 = { version = "0.12", optional = true }
+clap = "2.33.3"

Review comment:
       Thanks @alamb ! Just to confirm: Having one feature for command line tools that has both `clap` + `serde_json` as dependencies and when used without that feature, no binaries are produced. Is my understanding correct?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org