You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by da...@apache.org on 2022/08/26 18:25:30 UTC

[beam] branch master updated: Update wordcount_minimal.py by removing pipeline_args.extend (#22786)

This is an automated email from the ASF dual-hosted git repository.

damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new 39967a7ba3d Update wordcount_minimal.py by removing pipeline_args.extend (#22786)
39967a7ba3d is described below

commit 39967a7ba3dbb391db0757c7567a7e8c45f85fe8
Author: liferoad <hu...@gmail.com>
AuthorDate: Fri Aug 26 14:25:19 2022 -0400

    Update wordcount_minimal.py by removing pipeline_args.extend (#22786)
    
    * use known args
    
    * use pipeline_args
    
    Co-authored-by: XQ Hu <xq...@google.com>
---
 .gitignore                                         |  3 ++
 .../apache_beam/examples/wordcount_minimal.py      | 40 +++++++++++-----------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index 45c4b6505d1..443ced0aaee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,6 +92,9 @@ sdks/python/coverage.xml
 # Ignore Jupyter notebook checkpoints.
 **/.ipynb_checkpoints/**/*
 
+# Ignore python venv
+**/venv
+
 # NOTE: if you modify this file, you probably need to modify the file set that
 # is an input to 'maven-assembly-plugin' that generates source distribution.
 # This is typically in files named 'src.xml' throughout this repository.
diff --git a/sdks/python/apache_beam/examples/wordcount_minimal.py b/sdks/python/apache_beam/examples/wordcount_minimal.py
index 2159be221e4..7987af0d905 100644
--- a/sdks/python/apache_beam/examples/wordcount_minimal.py
+++ b/sdks/python/apache_beam/examples/wordcount_minimal.py
@@ -83,30 +83,30 @@ def main(argv=None, save_main_session=True):
   parser.add_argument(
       '--output',
       dest='output',
-      # CHANGE 1/6: The Google Cloud Storage path is required
+      # CHANGE 1/6: (OPTIONAL) Set the Google Cloud Storage path used
       # for outputting the results.
       default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
       help='Output file to write results to.')
+
+  # If you use DataflowRunner, the options below can be passed:
+  #   CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
+  #   run your pipeline on the Google Cloud Dataflow Service.
+  #   '--runner=DirectRunner',
+  #   CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
+  #   run your pipeline on the Google Cloud Dataflow Service.
+  #   '--project=SET_YOUR_PROJECT_ID_HERE',
+  #   CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
+  #   is required in order to run your pipeline on the Google Cloud
+  #   Dataflow Service.
+  #   '--region=SET_REGION_HERE',
+  #   CHANGE 5/6: Your Google Cloud Storage path is required for staging local
+  #   files.
+  #   '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
+  #   CHANGE 6/6: Your Google Cloud Storage path is required for temporary
+  #   files.
+  #   '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
+  #   '--job_name=your-wordcount-job',
   known_args, pipeline_args = parser.parse_known_args(argv)
-  pipeline_args.extend([
-      # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
-      # run your pipeline on the Google Cloud Dataflow Service.
-      '--runner=DirectRunner',
-      # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
-      # run your pipeline on the Google Cloud Dataflow Service.
-      '--project=SET_YOUR_PROJECT_ID_HERE',
-      # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
-      # is required in order to run your pipeline on the Google Cloud
-      # Dataflow Service.
-      '--region=SET_REGION_HERE',
-      # CHANGE 5/6: Your Google Cloud Storage path is required for staging local
-      # files.
-      '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
-      # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
-      # files.
-      '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
-      '--job_name=your-wordcount-job',
-  ])
 
   # We use the save_main_session option because one or more DoFn's in this
   # workflow rely on global context (e.g., a module imported at module level).