You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/10/12 20:42:25 UTC

[arrow-datafusion] branch master updated: Improve doc on lowercase treatment of columns on SQL (#3385)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new d5e6736dd Improve doc on lowercase treatment of columns on SQL (#3385)
d5e6736dd is described below

commit d5e6736dd286c0c0374bb0f65b88d3564155c976
Author: nbr <na...@users.noreply.github.com>
AuthorDate: Wed Oct 12 22:42:20 2022 +0200

    Improve doc on lowercase treatment of columns on SQL (#3385)
    
    * Improve doc
    
    Adds specific note on lowercase treatment of columns on SQL
    
    This example is the first thing newcomers see when they start with datafusion (literally the second page on the docs) so it should be clearer.
    
    * Update select.md
    
    add note on the use of default lower-case for SQL query
    
    * prettier this
    
    * improve example file link
    
    * add capitalized example
---
 datafusion/core/tests/capitalized_example.csv |  5 +++++
 docs/source/user-guide/example-usage.md       | 17 +++++++++++------
 docs/source/user-guide/sql/select.md          |  3 +++
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/tests/capitalized_example.csv b/datafusion/core/tests/capitalized_example.csv
new file mode 100644
index 000000000..dbc8f5c5a
--- /dev/null
+++ b/datafusion/core/tests/capitalized_example.csv
@@ -0,0 +1,5 @@
+A,b,c
+1,2,3
+1,10,5
+2,5,6
+2,1,4
\ No newline at end of file
diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md
index 48ca791ff..ced84ffa6 100644
--- a/docs/source/user-guide/example-usage.md
+++ b/docs/source/user-guide/example-usage.md
@@ -19,6 +19,10 @@
 
 # Example Usage
 
+In this example some simple processing is performed on a csv file. Please be aware that all identifiers are made lower-case in SQL, so if your csv file has capital letters (ex: Name) you should put your column name in double quotes or the example won't work.
+
+The following example uses [this file](../../../datafusion/core/tests/capitalized_example.csv)
+
 ## Update `Cargo.toml`
 
 Add the following to your `Cargo.toml` file:
@@ -37,10 +41,10 @@ use datafusion::prelude::*;
 async fn main() -> datafusion::error::Result<()> {
   // register the table
   let ctx = SessionContext::new();
-  ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new()).await?;
+  ctx.register_csv("example", "tests/capitalized_example.csv", CsvReadOptions::new()).await?;
 
   // create a plan to run a SQL query
-  let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100").await?;
+  let df = ctx.sql("SELECT \"A\", MIN(b) FROM example GROUP BY \"A\" LIMIT 100").await?;
 
   // execute and print results
   df.show().await?;
@@ -57,10 +61,10 @@ use datafusion::prelude::*;
 async fn main() -> datafusion::error::Result<()> {
   // create the dataframe
   let ctx = SessionContext::new();
-  let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
+  let df = ctx.read_csv("tests/capitalized_example.csv", CsvReadOptions::new()).await?;
 
-  let df = df.filter(col("a").lt_eq(col("b")))?
-           .aggregate(vec![col("a")], vec![min(col("b"))])?;
+  let df = df.filter(col("A").lt_eq(col("c")))?
+           .aggregate(vec![col("A")], vec![min(col("b"))])?;
 
   // execute and print results
   df.show_limit(100).await?;
@@ -72,8 +76,9 @@ async fn main() -> datafusion::error::Result<()> {
 
 ```text
 +---+--------+
-| a | MIN(b) |
+| A | MIN(b) |
 +---+--------+
+| 2 | 1      |
 | 1 | 2      |
 +---+--------+
 ```
diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md
index 008981fde..3eea252d7 100644
--- a/docs/source/user-guide/sql/select.md
+++ b/docs/source/user-guide/sql/select.md
@@ -20,6 +20,9 @@
 # SELECT syntax
 
 The queries in DataFusion scan data from tables and return 0 or more rows.
+Please be aware that column names in queries are made lower-case, but not on the inferred schema. Accordingly, if you
+want to query against a capitalized field, make sure to use double quotes. Please see this
+[example](https://arrow.apache.org/datafusion/user-guide/example-usage.html) for clarification.
 In this documentation we describe the SQL syntax in DataFusion.
 
 DataFusion supports the following syntax for queries: