You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by da...@apache.org on 2022/08/26 18:25:30 UTC
[beam] branch master updated: Update wordcount_minimal.py by removing pipeline_args.extend (#22786)
This is an automated email from the ASF dual-hosted git repository.
damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 39967a7ba3d Update wordcount_minimal.py by removing pipeline_args.extend (#22786)
39967a7ba3d is described below
commit 39967a7ba3dbb391db0757c7567a7e8c45f85fe8
Author: liferoad <hu...@gmail.com>
AuthorDate: Fri Aug 26 14:25:19 2022 -0400
Update wordcount_minimal.py by removing pipeline_args.extend (#22786)
* use known args
* use pipeline_args
Co-authored-by: XQ Hu <xq...@google.com>
---
.gitignore | 3 ++
.../apache_beam/examples/wordcount_minimal.py | 40 +++++++++++-----------
2 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/.gitignore b/.gitignore
index 45c4b6505d1..443ced0aaee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,6 +92,9 @@ sdks/python/coverage.xml
# Ignore Jupyter notebook checkpoints.
**/.ipynb_checkpoints/**/*
+# Ignore python venv
+**/venv
+
# NOTE: if you modify this file, you probably need to modify the file set that
# is an input to 'maven-assembly-plugin' that generates source distribution.
# This is typically in files named 'src.xml' throughout this repository.
diff --git a/sdks/python/apache_beam/examples/wordcount_minimal.py b/sdks/python/apache_beam/examples/wordcount_minimal.py
index 2159be221e4..7987af0d905 100644
--- a/sdks/python/apache_beam/examples/wordcount_minimal.py
+++ b/sdks/python/apache_beam/examples/wordcount_minimal.py
@@ -83,30 +83,30 @@ def main(argv=None, save_main_session=True):
parser.add_argument(
'--output',
dest='output',
- # CHANGE 1/6: The Google Cloud Storage path is required
+ # CHANGE 1/6: (OPTIONAL) Set the Google Cloud Storage path used
# for outputting the results.
default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
help='Output file to write results to.')
+
+ # If you use DataflowRunner, the options below can be passed:
+ # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
+ # run your pipeline on the Google Cloud Dataflow Service.
+ # '--runner=DirectRunner',
+ # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
+ # run your pipeline on the Google Cloud Dataflow Service.
+ # '--project=SET_YOUR_PROJECT_ID_HERE',
+ # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
+ # is required in order to run your pipeline on the Google Cloud
+ # Dataflow Service.
+ # '--region=SET_REGION_HERE',
+ # CHANGE 5/6: Your Google Cloud Storage path is required for staging local
+ # files.
+ # '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
+ # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
+ # files.
+ # '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
+ # '--job_name=your-wordcount-job',
known_args, pipeline_args = parser.parse_known_args(argv)
- pipeline_args.extend([
- # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
- # run your pipeline on the Google Cloud Dataflow Service.
- '--runner=DirectRunner',
- # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
- # run your pipeline on the Google Cloud Dataflow Service.
- '--project=SET_YOUR_PROJECT_ID_HERE',
- # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
- # is required in order to run your pipeline on the Google Cloud
- # Dataflow Service.
- '--region=SET_REGION_HERE',
- # CHANGE 5/6: Your Google Cloud Storage path is required for staging local
- # files.
- '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
- # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
- # files.
- '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
- '--job_name=your-wordcount-job',
- ])
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).