You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by bh...@apache.org on 2022/06/28 17:48:17 UTC

[hudi] branch asf-site updated: [DOCS] Fix Reliable ingestion from AWS S3 blog for configs (#5986)

This is an automated email from the ASF dual-hosted git repository.

bhavanisudha pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 89b33da0f8 [DOCS] Fix Reliable ingestion from AWS S3 blog for configs (#5986)
89b33da0f8 is described below

commit 89b33da0f8362efedb71765db93b0dac79383036
Author: Bhavani Sudha Saktheeswaran <21...@users.noreply.github.com>
AuthorDate: Tue Jun 28 10:48:09 2022 -0700

    [DOCS] Fix Reliable ingestion from AWS S3 blog for configs (#5986)
---
 website/blog/2021-08-23-s3-events-source.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/website/blog/2021-08-23-s3-events-source.md b/website/blog/2021-08-23-s3-events-source.md
index e48b89cfd1..541a9c30f0 100644
--- a/website/blog/2021-08-23-s3-events-source.md
+++ b/website/blog/2021-08-23-s3-events-source.md
@@ -79,20 +79,21 @@ spark-submit \
 --hoodie-conf hoodie.datasource.hive_sync.table=s3_meta_table \
 --hoodie-conf hoodie.datasource.hive_sync.partition_fields=bucket \
 --source-class org.apache.hudi.utilities.sources.S3EventsSource \
---hoodie-conf hoodie.deltastreamer.source.queue.url=https://sqs.us-west-2.amazonaws.com/queue/url
+--hoodie-conf hoodie.deltastreamer.s3.source.queue.url=https://sqs.us-west-2.amazonaws.com/queue/url
 --hoodie-conf hoodie.deltastreamer.s3.source.queue.region=us-west-2
 
-# To start S3EventsHoodieIncrSource
+# To start S3EventsHoodieIncrSource, use the following command, supplying the ordering field,
+# record key(s), and partition field(s) from the source S3 data.
 spark-submit \
 --jars "/home/hadoop/hudi-utilities-bundle_2.11-0.9.0.jar,/usr/lib/spark/external/lib/spark-avro.jar,/home/hadoop/aws-java-sdk-sqs-1.12.22.jar" \
 --master yarn --deploy-mode client \
 --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer /home/hadoop/hudi-packages/hudi-utilities-bundle_2.11-0.9.0-SNAPSHOT.jar \
 --table-type COPY_ON_WRITE \
---source-ordering-field eventTime --target-base-path s3://bucket_name/path/for/s3_hudi_table \
+--source-ordering-field <ordering key from source data> --target-base-path s3://bucket_name/path/for/s3_hudi_table \
 --target-table s3_hudi_table  --continuous --min-sync-interval-seconds 10 \
---hoodie-conf hoodie.datasource.write.recordkey.field="pull_request_id" \
+--hoodie-conf hoodie.datasource.write.recordkey.field="<record key from source data>" \
 --hoodie-conf hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator \
---hoodie-conf hoodie.datasource.write.partitionpath.field=s3.bucket.name --enable-hive-sync \
+--hoodie-conf hoodie.datasource.write.partitionpath.field=<partition key from source data> --enable-hive-sync \
 --hoodie-conf hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.MultiPartKeysValueExtractor \
 --hoodie-conf hoodie.datasource.write.hive_style_partitioning=true \
 --hoodie-conf hoodie.datasource.hive_sync.database=default \