Posted to commits@hudi.apache.org by si...@apache.org on 2022/10/28 02:15:46 UTC

[hudi] branch asf-site updated: [MINOR][DOCS] removed validations from the versioned docs (#7079)

This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new dfb0acffeb [MINOR][DOCS] removed validations from the versioned docs (#7079)
dfb0acffeb is described below

commit dfb0acffeb62993b21c53fca8ff8d939ab02e444
Author: Jon Vexler <jb...@gmail.com>
AuthorDate: Thu Oct 27 19:15:40 2022 -0700

    [MINOR][DOCS] removed validations from the versioned docs (#7079)
    
    Co-authored-by: Jonathan Vexler <=>
---
 .../version-0.12.0/quick-start-guide.md            | 42 +---------------------
 .../version-0.12.1/quick-start-guide.md            | 42 +---------------------
 2 files changed, 2 insertions(+), 82 deletions(-)
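
The assertions this patch removes all follow one pattern: read the table back
through a registered snapshot view and compare the result against the DataFrame
that was just written, using Spark's DataFrame set operations. A minimal,
hedged sketch of that pattern (the view and column names here are illustrative,
not the docs' exact code):

    // spark-shell -- sketch of the validation pattern this patch removes.
    // Assumes df holds the rows just written and the table has been
    // registered as the temp view hudi_trips_snapshot.
    val snapshotQuery = "SELECT uuid, partitionpath, ts FROM hudi_trips_snapshot"
    // except() is a set difference: an empty result means every row of df
    // (projected to the same columns) came back in the snapshot read
    assert(df.select("uuid", "partitionpath", "ts")
      .except(spark.sql(snapshotQuery)).count() == 0)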

diff --git a/website/versioned_docs/version-0.12.0/quick-start-guide.md b/website/versioned_docs/version-0.12.0/quick-start-guide.md
index 2610a88f30..ed7bb29698 100644
--- a/website/versioned_docs/version-0.12.0/quick-start-guide.md
+++ b/website/versioned_docs/version-0.12.0/quick-start-guide.md
@@ -189,7 +189,6 @@ import org.apache.hudi.common.model.HoodieRecord
 val tableName = "hudi_trips_cow"
 val basePath = "file:///tmp/hudi_trips_cow"
 val dataGen = new DataGenerator
-val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
 ```
 
 </TabItem>
@@ -200,7 +199,6 @@ val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare
 tableName = "hudi_trips_cow"
 basePath = "file:///tmp/hudi_trips_cow"
 dataGen = sc._jvm.org.apache.hudi.QuickstartUtils.DataGenerator()
-snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
 ```
 
 </TabItem>
@@ -290,7 +288,7 @@ create table hudi_cow_nonpcf_tbl (
 ) using hudi;
 
 
--- create a mor non-partitioned table without preCombineField provided
+-- create a mor non-partitioned table with preCombineField provided
 create table hudi_mor_tbl (
   id int,
   name string,
@@ -429,9 +427,6 @@ df.write.format("hudi").
   option(TABLE_NAME, tableName).
   mode(Overwrite).
   save(basePath)
-  
-// validations
-assert(df.except(spark.sql(snapshotQuery)).count() == 0)
 ```
 :::info
 `mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -468,9 +463,6 @@ df.write.format("hudi"). \
     options(**hudi_options). \
     mode("overwrite"). \
     save(basePath)
-    
-# validations
-assert spark.sql(snapshotQuery).exceptAll(df).count() == 0
 ```
 :::info
 `mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -713,7 +705,6 @@ values={[
 
 ```scala
 // spark-shell
-val snapBeforeUpdate = spark.sql(snapshotQuery)
 val updates = convertToStringList(dataGen.generateUpdates(10))
 val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
 df.write.format("hudi").
@@ -724,10 +715,6 @@ df.write.format("hudi").
   option(TABLE_NAME, tableName).
   mode(Append).
   save(basePath)
-  
-// validations
-assert(spark.sql(snapshotQuery).intersect(df).count() == df.count())
-assert(spark.sql(snapshotQuery).except(df).except(snapBeforeUpdate).count() == 0)
 ```
 :::note
 Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -816,17 +803,12 @@ when not matched then
 
 ```python
 # pyspark
-snapshotBeforeUpdate = spark.sql(snapshotQuery)
 updates = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateUpdates(10))
 df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
 df.write.format("hudi"). \
   options(**hudi_options). \
   mode("append"). \
   save(basePath)
-  
-# validations
-assert spark.sql(snapshotQuery).intersect(df).count() == df.count()
-assert spark.sql(snapshotQuery).exceptAll(snapshotBeforeUpdate).exceptAll(df).count() == 0
 ```
 :::note
 Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -1122,7 +1104,6 @@ Delete records for the HoodieKeys passed in.<br/>
 
 ```scala
 // spark-shell
-val snapshotBeforeDelete = spark.sql(snapshotQuery)
 // fetch total records count
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
 // fetch two records to be deleted
@@ -1151,10 +1132,6 @@ val roAfterDeleteViewDF = spark.
 roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot")
 // fetch should return (total - 2) records
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-// validations
-assert(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hardDeleteDf).count() == 0)
-assert(snapshotBeforeDelete.except(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).except(snapshotBeforeDelete).count() == 0)
 ```
 :::note
 Only `Append` mode is supported for delete operation.
@@ -1182,7 +1159,6 @@ Delete records for the HoodieKeys passed in.<br/>
 
 ```python
 # pyspark
-snapshotBeforeDelete = spark.sql(snapshotQuery)
 # fetch total records count
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
 # fetch two records to be deleted
@@ -1216,10 +1192,6 @@ roAfterDeleteViewDF = spark. \
 roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")
 # fetch should return (total - 2) records
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-# validations
-assert spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hard_delete_df).count() == 0
-assert snapshotBeforeDelete.excptAll(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).count() == 0
 ```
 :::note
 Only `Append` mode is supported for delete operation.
@@ -1256,7 +1228,6 @@ spark.
   sort("partitionpath","uuid").
   show(100, false)
 
-val snapshotBeforeOverwrite = spark.sql(snapshotQuery)
 val inserts = convertToStringList(dataGen.generateInserts(10))
 val df = spark.
   read.json(spark.sparkContext.parallelize(inserts, 2)).
@@ -1278,11 +1249,6 @@ spark.
   select("uuid","partitionpath").
   sort("partitionpath","uuid").
   show(100, false)
-  
-// validations
-val withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-val expectedDf = withoutSanFran.union(df)
-assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
 ```
 </TabItem>
 
@@ -1290,7 +1256,6 @@ assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
 
 ```python
 # pyspark
-snapshotBeforeOverwrite = spark.sql(snapshotQuery)
 self.spark.read.format("hudi"). \
     load(basePath). \
     select(["uuid", "partitionpath"]). \
@@ -1316,11 +1281,6 @@ spark.read.format("hudi"). \
     select(["uuid", "partitionpath"]). \
     sort(["partitionpath", "uuid"]). \
     show(n=100, truncate=False)
-
-# validations
-withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-expectedDf = withoutSanFran.union(df)
-assert spark.sql(snapshotQuery).exceptAll(expectedDf).count() == 0
 ```
 </TabItem>
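
The update hunks above drop a two-assertion check built from intersect() and
except(); the delete and insert-overwrite hunks drop variations on the same
idea. Restated as a sketch, with snapshotQuery and df as defined in the
surrounding quickstart context:

    // spark-shell -- sketch of the removed update validation
    val snapBeforeUpdate = spark.sql(snapshotQuery)  // state before the upsert
    // ... write the update batch df with mode(Append), as in the quickstart ...
    // 1) every updated row is visible in the new snapshot
    assert(spark.sql(snapshotQuery).intersect(df).count() == df.count())
    // 2) the new snapshot holds nothing beyond the prior rows plus the updates
    assert(spark.sql(snapshotQuery).except(df).except(snapBeforeUpdate).count() == 0)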
 
diff --git a/website/versioned_docs/version-0.12.1/quick-start-guide.md b/website/versioned_docs/version-0.12.1/quick-start-guide.md
index 24df38e9aa..c610964f6c 100644
--- a/website/versioned_docs/version-0.12.1/quick-start-guide.md
+++ b/website/versioned_docs/version-0.12.1/quick-start-guide.md
@@ -189,7 +189,6 @@ import org.apache.hudi.common.model.HoodieRecord
 val tableName = "hudi_trips_cow"
 val basePath = "file:///tmp/hudi_trips_cow"
 val dataGen = new DataGenerator
-val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
 ```
 
 </TabItem>
@@ -200,7 +199,6 @@ val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare
 tableName = "hudi_trips_cow"
 basePath = "file:///tmp/hudi_trips_cow"
 dataGen = sc._jvm.org.apache.hudi.QuickstartUtils.DataGenerator()
-snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
 ```
 
 </TabItem>
@@ -290,7 +288,7 @@ create table hudi_cow_nonpcf_tbl (
 ) using hudi;
 
 
--- create a mor non-partitioned table without preCombineField provided
+-- create a mor non-partitioned table with preCombineField provided
 create table hudi_mor_tbl (
   id int,
   name string,
@@ -429,9 +427,6 @@ df.write.format("hudi").
   option(TABLE_NAME, tableName).
   mode(Overwrite).
   save(basePath)
-  
-// validations
-assert(df.except(spark.sql(snapshotQuery)).count() == 0)
 ```
 :::info
 `mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -468,9 +463,6 @@ df.write.format("hudi"). \
     options(**hudi_options). \
     mode("overwrite"). \
     save(basePath)
-    
-# validations
-assert spark.sql(snapshotQuery).exceptAll(df).count() == 0
 ```
 :::info
 `mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -713,7 +705,6 @@ values={[
 
 ```scala
 // spark-shell
-val snapBeforeUpdate = spark.sql(snapshotQuery)
 val updates = convertToStringList(dataGen.generateUpdates(10))
 val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
 df.write.format("hudi").
@@ -724,10 +715,6 @@ df.write.format("hudi").
   option(TABLE_NAME, tableName).
   mode(Append).
   save(basePath)
-  
-// validations
-assert(spark.sql(snapshotQuery).intersect(df).count() == df.count())
-assert(spark.sql(snapshotQuery).except(df).except(snapBeforeUpdate).count() == 0)
 ```
 :::note
 Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -816,17 +803,12 @@ when not matched then
 
 ```python
 # pyspark
-snapshotBeforeUpdate = spark.sql(snapshotQuery)
 updates = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateUpdates(10))
 df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
 df.write.format("hudi"). \
   options(**hudi_options). \
   mode("append"). \
   save(basePath)
-  
-# validations
-assert spark.sql(snapshotQuery).intersect(df).count() == df.count()
-assert spark.sql(snapshotQuery).exceptAll(snapshotBeforeUpdate).exceptAll(df).count() == 0
 ```
 :::note
 Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -1122,7 +1104,6 @@ Delete records for the HoodieKeys passed in.<br/>
 
 ```scala
 // spark-shell
-val snapshotBeforeDelete = spark.sql(snapshotQuery)
 // fetch total records count
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
 // fetch two records to be deleted
@@ -1151,10 +1132,6 @@ val roAfterDeleteViewDF = spark.
 roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot")
 // fetch should return (total - 2) records
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-// validations
-assert(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hardDeleteDf).count() == 0)
-assert(snapshotBeforeDelete.except(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).except(snapshotBeforeDelete).count() == 0)
 ```
 :::note
 Only `Append` mode is supported for delete operation.
@@ -1182,7 +1159,6 @@ Delete records for the HoodieKeys passed in.<br/>
 
 ```python
 # pyspark
-snapshotBeforeDelete = spark.sql(snapshotQuery)
 # fetch total records count
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
 # fetch two records to be deleted
@@ -1216,10 +1192,6 @@ roAfterDeleteViewDF = spark. \
 roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")
 # fetch should return (total - 2) records
 spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-# validations
-assert spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hard_delete_df).count() == 0
-assert snapshotBeforeDelete.excptAll(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).count() == 0
 ```
 :::note
 Only `Append` mode is supported for delete operation.
@@ -1256,7 +1228,6 @@ spark.
   sort("partitionpath","uuid").
   show(100, false)
 
-val snapshotBeforeOverwrite = spark.sql(snapshotQuery)
 val inserts = convertToStringList(dataGen.generateInserts(10))
 val df = spark.
   read.json(spark.sparkContext.parallelize(inserts, 2)).
@@ -1278,11 +1249,6 @@ spark.
   select("uuid","partitionpath").
   sort("partitionpath","uuid").
   show(100, false)
-  
-// validations
-val withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-val expectedDf = withoutSanFran.union(df)
-assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
 ```
 </TabItem>
 
@@ -1290,7 +1256,6 @@ assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
 
 ```python
 # pyspark
-snapshotBeforeOverwrite = spark.sql(snapshotQuery)
 self.spark.read.format("hudi"). \
     load(basePath). \
     select(["uuid", "partitionpath"]). \
@@ -1316,11 +1281,6 @@ spark.read.format("hudi"). \
     select(["uuid", "partitionpath"]). \
     sort(["partitionpath", "uuid"]). \
     show(n=100, truncate=False)
-
-# validations
-withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-expectedDf = withoutSanFran.union(df)
-assert spark.sql(snapshotQuery).exceptAll(expectedDf).count() == 0
 ```
 </TabItem>
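
A few details in the removed assertions are worth flagging, and are plausibly
why they were pulled from the versioned docs: the snapshot query reads FROM
hudi_ro_table, while the hunks shown here register the view as
hudi_trips_snapshot; the pyspark delete assertion calls excptAll, which is not
a DataFrame method (exceptAll is); and the scala delete assertion ends with
.except(snapshotBeforeDelete), so it passes vacuously. Note also that except()
and exceptAll() are not interchangeable: except() deduplicates (set
difference) while exceptAll() preserves duplicate multiplicity (multiset
difference). A quick spark-shell check:

    // spark-shell -- except() vs exceptAll() on duplicate rows
    val a = Seq(1, 1, 2).toDF("n")
    val b = Seq(1).toDF("n")
    a.except(b).count()     // 1 -- duplicates collapse; only the 2 survives
    a.exceptAll(b).count()  // 2 -- one 1 is subtracted; the other 1 and the 2 remain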