You are viewing a plain text version of this content; the hyperlink to the canonical (HTML) version was lost in this extraction and is available in the commits@hudi.apache.org mailing list archive.
Posted to commits@hudi.apache.org by si...@apache.org on 2022/10/28 02:15:46 UTC
[hudi] branch asf-site updated: [MINOR][DOCS] removed validations from the versioned docs (#7079)
This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
new dfb0acffeb [MINOR][DOCS] removed validations from the versioned docs (#7079)
dfb0acffeb is described below
commit dfb0acffeb62993b21c53fca8ff8d939ab02e444
Author: Jon Vexler <jb...@gmail.com>
AuthorDate: Thu Oct 27 19:15:40 2022 -0700
[MINOR][DOCS] removed validations from the versioned docs (#7079)
Co-authored-by: Jonathan Vexler <=>
---
.../version-0.12.0/quick-start-guide.md | 42 +---------------------
.../version-0.12.1/quick-start-guide.md | 42 +---------------------
2 files changed, 2 insertions(+), 82 deletions(-)
diff --git a/website/versioned_docs/version-0.12.0/quick-start-guide.md b/website/versioned_docs/version-0.12.0/quick-start-guide.md
index 2610a88f30..ed7bb29698 100644
--- a/website/versioned_docs/version-0.12.0/quick-start-guide.md
+++ b/website/versioned_docs/version-0.12.0/quick-start-guide.md
@@ -189,7 +189,6 @@ import org.apache.hudi.common.model.HoodieRecord
val tableName = "hudi_trips_cow"
val basePath = "file:///tmp/hudi_trips_cow"
val dataGen = new DataGenerator
-val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
```
</TabItem>
@@ -200,7 +199,6 @@ val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare
tableName = "hudi_trips_cow"
basePath = "file:///tmp/hudi_trips_cow"
dataGen = sc._jvm.org.apache.hudi.QuickstartUtils.DataGenerator()
-snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
```
</TabItem>
@@ -290,7 +288,7 @@ create table hudi_cow_nonpcf_tbl (
) using hudi;
--- create a mor non-partitioned table without preCombineField provided
+-- create a mor non-partitioned table with preCombineField provided
create table hudi_mor_tbl (
id int,
name string,
@@ -429,9 +427,6 @@ df.write.format("hudi").
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath)
-
-// validations
-assert(df.except(spark.sql(snapshotQuery)).count() == 0)
```
:::info
`mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -468,9 +463,6 @@ df.write.format("hudi"). \
options(**hudi_options). \
mode("overwrite"). \
save(basePath)
-
-# validations
-assert spark.sql(snapshotQuery).exceptAll(df).count() == 0
```
:::info
`mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -713,7 +705,6 @@ values={[
```scala
// spark-shell
-val snapBeforeUpdate = spark.sql(snapshotQuery)
val updates = convertToStringList(dataGen.generateUpdates(10))
val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi").
@@ -724,10 +715,6 @@ df.write.format("hudi").
option(TABLE_NAME, tableName).
mode(Append).
save(basePath)
-
-// validations
-assert(spark.sql(snapshotQuery).intersect(df).count() == df.count())
-assert(spark.sql(snapshotQuery).except(df).except(snapBeforeUpdate).count() == 0)
```
:::note
Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -816,17 +803,12 @@ when not matched then
```python
# pyspark
-snapshotBeforeUpdate = spark.sql(snapshotQuery)
updates = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateUpdates(10))
df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi"). \
options(**hudi_options). \
mode("append"). \
save(basePath)
-
-# validations
-assert spark.sql(snapshotQuery).intersect(df).count() == df.count()
-assert spark.sql(snapshotQuery).exceptAll(snapshotBeforeUpdate).exceptAll(df).count() == 0
```
:::note
Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -1122,7 +1104,6 @@ Delete records for the HoodieKeys passed in.<br/>
```scala
// spark-shell
-val snapshotBeforeDelete = spark.sql(snapshotQuery)
// fetch total records count
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
// fetch two records to be deleted
@@ -1151,10 +1132,6 @@ val roAfterDeleteViewDF = spark.
roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot")
// fetch should return (total - 2) records
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-// validations
-assert(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hardDeleteDf).count() == 0)
-assert(snapshotBeforeDelete.except(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).except(snapshotBeforeDelete).count() == 0)
```
:::note
Only `Append` mode is supported for delete operation.
@@ -1182,7 +1159,6 @@ Delete records for the HoodieKeys passed in.<br/>
```python
# pyspark
-snapshotBeforeDelete = spark.sql(snapshotQuery)
# fetch total records count
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
# fetch two records to be deleted
@@ -1216,10 +1192,6 @@ roAfterDeleteViewDF = spark. \
roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")
# fetch should return (total - 2) records
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-# validations
-assert spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hard_delete_df).count() == 0
-assert snapshotBeforeDelete.excptAll(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).count() == 0
```
:::note
Only `Append` mode is supported for delete operation.
@@ -1256,7 +1228,6 @@ spark.
sort("partitionpath","uuid").
show(100, false)
-val snapshotBeforeOverwrite = spark.sql(snapshotQuery)
val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.
read.json(spark.sparkContext.parallelize(inserts, 2)).
@@ -1278,11 +1249,6 @@ spark.
select("uuid","partitionpath").
sort("partitionpath","uuid").
show(100, false)
-
-// validations
-val withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-val expectedDf = withoutSanFran.union(df)
-assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
```
</TabItem>
@@ -1290,7 +1256,6 @@ assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
```python
# pyspark
-snapshotBeforeOverwrite = spark.sql(snapshotQuery)
self.spark.read.format("hudi"). \
load(basePath). \
select(["uuid", "partitionpath"]). \
@@ -1316,11 +1281,6 @@ spark.read.format("hudi"). \
select(["uuid", "partitionpath"]). \
sort(["partitionpath", "uuid"]). \
show(n=100, truncate=False)
-
-# validations
-withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-expectedDf = withoutSanFran.union(df)
-assert spark.sql(snapshotQuery).exceptAll(expectedDf).count() == 0
```
</TabItem>
diff --git a/website/versioned_docs/version-0.12.1/quick-start-guide.md b/website/versioned_docs/version-0.12.1/quick-start-guide.md
index 24df38e9aa..c610964f6c 100644
--- a/website/versioned_docs/version-0.12.1/quick-start-guide.md
+++ b/website/versioned_docs/version-0.12.1/quick-start-guide.md
@@ -189,7 +189,6 @@ import org.apache.hudi.common.model.HoodieRecord
val tableName = "hudi_trips_cow"
val basePath = "file:///tmp/hudi_trips_cow"
val dataGen = new DataGenerator
-val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
```
</TabItem>
@@ -200,7 +199,6 @@ val snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare
tableName = "hudi_trips_cow"
basePath = "file:///tmp/hudi_trips_cow"
dataGen = sc._jvm.org.apache.hudi.QuickstartUtils.DataGenerator()
-snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"
```
</TabItem>
@@ -290,7 +288,7 @@ create table hudi_cow_nonpcf_tbl (
) using hudi;
--- create a mor non-partitioned table without preCombineField provided
+-- create a mor non-partitioned table with preCombineField provided
create table hudi_mor_tbl (
id int,
name string,
@@ -429,9 +427,6 @@ df.write.format("hudi").
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath)
-
-// validations
-assert(df.except(spark.sql(snapshotQuery)).count() == 0)
```
:::info
`mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -468,9 +463,6 @@ df.write.format("hudi"). \
options(**hudi_options). \
mode("overwrite"). \
save(basePath)
-
-# validations
-assert spark.sql(snapshotQuery).exceptAll(df).count() == 0
```
:::info
`mode(Overwrite)` overwrites and recreates the table if it already exists.
@@ -713,7 +705,6 @@ values={[
```scala
// spark-shell
-val snapBeforeUpdate = spark.sql(snapshotQuery)
val updates = convertToStringList(dataGen.generateUpdates(10))
val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi").
@@ -724,10 +715,6 @@ df.write.format("hudi").
option(TABLE_NAME, tableName).
mode(Append).
save(basePath)
-
-// validations
-assert(spark.sql(snapshotQuery).intersect(df).count() == df.count())
-assert(spark.sql(snapshotQuery).except(df).except(snapBeforeUpdate).count() == 0)
```
:::note
Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -816,17 +803,12 @@ when not matched then
```python
# pyspark
-snapshotBeforeUpdate = spark.sql(snapshotQuery)
updates = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateUpdates(10))
df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi"). \
options(**hudi_options). \
mode("append"). \
save(basePath)
-
-# validations
-assert spark.sql(snapshotQuery).intersect(df).count() == df.count()
-assert spark.sql(snapshotQuery).exceptAll(snapshotBeforeUpdate).exceptAll(df).count() == 0
```
:::note
Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
@@ -1122,7 +1104,6 @@ Delete records for the HoodieKeys passed in.<br/>
```scala
// spark-shell
-val snapshotBeforeDelete = spark.sql(snapshotQuery)
// fetch total records count
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
// fetch two records to be deleted
@@ -1151,10 +1132,6 @@ val roAfterDeleteViewDF = spark.
roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot")
// fetch should return (total - 2) records
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-// validations
-assert(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hardDeleteDf).count() == 0)
-assert(snapshotBeforeDelete.except(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).except(snapshotBeforeDelete).count() == 0)
```
:::note
Only `Append` mode is supported for delete operation.
@@ -1182,7 +1159,6 @@ Delete records for the HoodieKeys passed in.<br/>
```python
# pyspark
-snapshotBeforeDelete = spark.sql(snapshotQuery)
# fetch total records count
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
# fetch two records to be deleted
@@ -1216,10 +1192,6 @@ roAfterDeleteViewDF = spark. \
roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")
# fetch should return (total - 2) records
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
-
-# validations
-assert spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot").intersect(hard_delete_df).count() == 0
-assert snapshotBeforeDelete.excptAll(spark.sql("select uuid, partitionpath, ts from hudi_trips_snapshot")).count() == 0
```
:::note
Only `Append` mode is supported for delete operation.
@@ -1256,7 +1228,6 @@ spark.
sort("partitionpath","uuid").
show(100, false)
-val snapshotBeforeOverwrite = spark.sql(snapshotQuery)
val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.
read.json(spark.sparkContext.parallelize(inserts, 2)).
@@ -1278,11 +1249,6 @@ spark.
select("uuid","partitionpath").
sort("partitionpath","uuid").
show(100, false)
-
-// validations
-val withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-val expectedDf = withoutSanFran.union(df)
-assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
```
</TabItem>
@@ -1290,7 +1256,6 @@ assert(spark.sql(snapshotQuery).except(expectedDf).count() == 0)
```python
# pyspark
-snapshotBeforeOverwrite = spark.sql(snapshotQuery)
self.spark.read.format("hudi"). \
load(basePath). \
select(["uuid", "partitionpath"]). \
@@ -1316,11 +1281,6 @@ spark.read.format("hudi"). \
select(["uuid", "partitionpath"]). \
sort(["partitionpath", "uuid"]). \
show(n=100, truncate=False)
-
-# validations
-withoutSanFran = snapshotBeforeOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'")
-expectedDf = withoutSanFran.union(df)
-assert spark.sql(snapshotQuery).exceptAll(expectedDf).count() == 0
```
</TabItem>