Posted to dev@beam.apache.org by Lukasz Cwik <lc...@google.com> on 2018/08/24 20:26:21 UTC
Re: INFO:root:Executing Error when executing a pipeline on dataflow
It seems like we only mention the need for pip 7.0.0 on the python
quickstart page https://beam.apache.org/get-started/quickstart-py/
Would you like to submit a change to update it?
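A quick way to confirm what an environment actually has (a sketch, not from
the quickstart itself):

import pkg_resources

# The quickstart states the pip >= 7.0.0 floor; note also that newer pip
# releases (10+) removed 'pip install --download' entirely, which is the
# call that fails below.
print('pip version: %s' % pkg_resources.get_distribution('pip').version)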
On Wed, Aug 22, 2018 at 9:31 AM OrielResearch Eila Arich-Landkof <
eila@orielresearch.org> wrote:
> The issue was with the pip version: --download was deprecated. I don't know
> where this needs to be mentioned / fixed.
> running
> pip install pip==9.0.3
>
> solved the issue.
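>
> For reference, on pip >= 8 the standalone 'pip download' subcommand is the
> replacement for the deprecated 'pip install --download' that this SDK
> version invokes; a minimal sketch of the equivalent command (destination
> directory illustrative):
>
> import subprocess
> # Mirrors the failing command from the log, using the newer subcommand.
> subprocess.check_call([
>     '/usr/local/envs/py2env/bin/python', '-m', 'pip', 'download',
>     '-d', '/tmp/sdk_download',
>     'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps'])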
>
> Thanks,
> eila
>
> On Wed, Aug 22, 2018 at 11:20 AM OrielResearch Eila Arich-Landkof <
> eila@orielresearch.org> wrote:
>
>> I tried a simple pipeline which runs perfectly on the local runner and
>> hits the same issue on Dataflow; see below. Is there anything in the
>> environment that needs to be updated that I am not aware of?
>>
>> Many thanks for any reference.
>> Eila
>>
>> import apache_beam as beam
>> # Imports assumed for a self-contained snippet (module path per Beam 2.x):
>> from apache_beam.options.pipeline_options import (
>>     PipelineOptions, GoogleCloudOptions, StandardOptions)
>>
>> BUCKET_URL = 'gs://archs4'  # placeholder, per the commented paths below
>> options = PipelineOptions()
>> google_cloud_options = options.view_as(GoogleCloudOptions)
>> google_cloud_options.project = 'PROJECT-ID'
>> google_cloud_options.job_name = 'try-debug'
>> google_cloud_options.staging_location = '%s/staging' % BUCKET_URL #'gs://archs4/staging'
>> google_cloud_options.temp_location = '%s/tmp' % BUCKET_URL #'gs://archs4/temp'
>> options.view_as(StandardOptions).runner = 'DataflowRunner'
>>
>> p1 = beam.Pipeline(options=options)
>>
>> (p1 | 'read' >> beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
>> | 'write' >> beam.io.WriteToText('gs://bucket/test.txt', num_shards=1)
>> )
>>
>> p1.run().wait_until_finish()
>>
>> will fire the following error:
>>
>> CalledProcessErrorTraceback (most recent call last)
>> <ipython-input-17-b4be63f7802f> in <module>()
>> 5 )
>> 6
>> ----> 7 p1.run().wait_until_finish()
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/pipeline.pyc in run(self, test_runner_api)
>> 174 finally:
>> 175 shutil.rmtree(tmpdir)
>> --> 176 return self.runner.run(self)
>> 177
>> 178 def __enter__(self):
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.pyc in run(self, pipeline)
>> 250 # Create the job
>> 251 result = DataflowPipelineResult(
>> --> 252 self.dataflow_client.create_job(self.job), self)
>> 253
>> 254 self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/retry.pyc in wrapper(*args, **kwargs)
>> 166 while True:
>> 167 try:
>> --> 168 return fun(*args, **kwargs)
>> 169 except Exception as exn: # pylint: disable=broad-except
>> 170 if not retry_filter(exn):
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job(self, job)
>> 423 def create_job(self, job):
>> 424 """Creates job description. May stage and/or submit for remote execution."""
>> --> 425 self.create_job_description(job)
>> 426
>> 427 # Stage and submit the job when necessary
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job_description(self, job)
>> 446 """Creates a job described by the workflow proto."""
>> 447 resources = dependency.stage_job_resources(
>> --> 448 job.options, file_copy=self._gcs_file_copy)
>> 449 job.proto.environment = Environment(
>> 450 packages=resources, options=job.options,
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in stage_job_resources(options, file_copy, build_setup_args, temp_dir, populate_requirements_cache)
>> 377 else:
>> 378 sdk_remote_location = setup_options.sdk_location
>> --> 379 _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>> 380 resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
>> 381 else:
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>> 462 elif sdk_remote_location == 'pypi':
>> 463 logging.info('Staging the SDK tarball from PyPI to %s', staged_path)
>> --> 464 _dependency_file_copy(_download_pypi_sdk_package(temp_dir), staged_path)
>> 465 else:
>> 466 raise RuntimeError(
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _download_pypi_sdk_package(temp_dir)
>> 525 '--no-binary', ':all:', '--no-deps']
>> 526 logging.info('Executing command: %s', cmd_args)
>> --> 527 processes.check_call(cmd_args)
>> 528 zip_expected = os.path.join(
>> 529 temp_dir, '%s-%s.zip' % (package_name, version))
>>
>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/processes.pyc in check_call(*args, **kwargs)
>> 42 if force_shell:
>> 43 kwargs['shell'] = True
>> ---> 44 return subprocess.check_call(*args, **kwargs)
>> 45
>> 46
>>
>> /usr/local/envs/py2env/lib/python2.7/subprocess.pyc in check_call(*popenargs, **kwargs)
>> 188 if cmd is None:
>> 189 cmd = popenargs[0]
>> --> 190 raise CalledProcessError(retcode, cmd)
>> 191 return 0
>> 192
>>
>> CalledProcessError: Command '['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmpyyiizo', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']' returned non-zero exit status 2
>>
>>
>>
>> On Wed, Aug 22, 2018 at 10:39 AM OrielResearch Eila Arich-Landkof <
>> eila@orielresearch.org> wrote:
>>
>>> Hello all,
>>>
>>> I am running a pipeline that used to execute on Dataflow with no
>>> issues. I am using the Datalab environment. See the error below; to my
>>> understanding, it happens before the pipeline code is executed.
>>> Any idea what went wrong?
>>>
>>> Thanks,
>>> Eila
>>>
>>>
>>> Executing the pipeline:
>>>
>>> *p.run().wait_until_finish()*
>>>
>>> The following error is raised:
>>>
>>> INFO:root:Executing command: ['/usr/local/envs/py2env/bin/python', 'setup.py', 'sdist', '--dist-dir', '/tmp/tmp_B0gnK']
>>> INFO:root:Starting GCS upload to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/workflow.tar.gz...
>>> INFO:oauth2client.client:Attempting refresh to obtain initial access_token
>>> INFO:root:Completed GCS upload to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/workflow.tar.gz
>>> INFO:root:Staging the SDK tarball from PyPI to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/dataflow_python_sdk.tar
>>> INFO:root:Executing command: ['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmp_B0gnK', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']
>>>
>>> CalledProcessErrorTraceback (most recent call last)
>>> <ipython-input-27-1e5aeb8b7d9b> in <module>()
>>> ----> 1 p.run().wait_until_finish()
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/pipeline.pyc in run(self, test_runner_api)
>>> 174 finally:
>>> 175 shutil.rmtree(tmpdir)
>>> --> 176 return self.runner.run(self)
>>> 177
>>> 178 def __enter__(self):
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.pyc in run(self, pipeline)
>>> 250 # Create the job
>>> 251 result = DataflowPipelineResult(
>>> --> 252 self.dataflow_client.create_job(self.job), self)
>>> 253
>>> 254 self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/retry.pyc in wrapper(*args, **kwargs)
>>> 166 while True:
>>> 167 try:
>>> --> 168 return fun(*args, **kwargs)
>>> 169 except Exception as exn: # pylint: disable=broad-except
>>> 170 if not retry_filter(exn):
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job(self, job)
>>> 423 def create_job(self, job):
>>> 424 """Creates job description. May stage and/or submit for remote execution."""
>>> --> 425 self.create_job_description(job)
>>> 426
>>> 427 # Stage and submit the job when necessary
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job_description(self, job)
>>> 446 """Creates a job described by the workflow proto."""
>>> 447 resources = dependency.stage_job_resources(
>>> --> 448 job.options, file_copy=self._gcs_file_copy)
>>> 449 job.proto.environment = Environment(
>>> 450 packages=resources, options=job.options,
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in stage_job_resources(options, file_copy, build_setup_args, temp_dir, populate_requirements_cache)
>>> 377 else:
>>> 378 sdk_remote_location = setup_options.sdk_location
>>> --> 379 _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>> 380 resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
>>> 381 else:
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>> 462 elif sdk_remote_location == 'pypi':
>>> 463 logging.info('Staging the SDK tarball from PyPI to %s', staged_path)
>>> --> 464 _dependency_file_copy(_download_pypi_sdk_package(temp_dir), staged_path)
>>> 465 else:
>>> 466 raise RuntimeError(
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _download_pypi_sdk_package(temp_dir)
>>> 525 '--no-binary', ':all:', '--no-deps']
>>> 526 logging.info('Executing command: %s', cmd_args)
>>> --> 527 processes.check_call(cmd_args)
>>> 528 zip_expected = os.path.join(
>>> 529 temp_dir, '%s-%s.zip' % (package_name, version))
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/processes.pyc in check_call(*args, **kwargs)
>>> 42 if force_shell:
>>> 43 kwargs['shell'] = True
>>> ---> 44 return subprocess.check_call(*args, **kwargs)
>>> 45
>>> 46
>>>
>>> /usr/local/envs/py2env/lib/python2.7/subprocess.pyc in check_call(*popenargs, **kwargs)
>>> 188 if cmd is None:
>>> 189 cmd = popenargs[0]
>>> --> 190 raise CalledProcessError(retcode, cmd)
>>> 191 return 0
>>> 192
>>> CalledProcessError: Command '['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmp_B0gnK', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']' returned non-zero exit status 2
>>>
>>>
>>>
>>> --
>>> Eila
>>> www.orielresearch.org
>>> https://www.meetup.com/Deep-Learning-In-Production/
>>>
>>>
>>>
>>
>> --
>> Eila
>> www.orielresearch.org
>> https://www.meetup.com/Deep-Learning-In-Production/
>>
>>
>>
>
> --
> Eila
> www.orielresearch.org
> https://www.meetup.com/Deep-Learning-In-Production/
>
>
>
Re: INFO:root:Executing Error when executing a pipeline on dataflow
Posted by Ahmet Altay <al...@google.com>.
Eila, I believe you are using version 2.0.0. Use of the --download flag is
fixed at head (I do not recall the exact version of the fix; it could be 2.2
or 2.3). If possible, please try a newer version of Beam.
Also, as Luke suggested, we would welcome any contributions to the
documentation.
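
For anyone who can change their environment, a sketch of the upgrade path
(the pinned version below is illustrative, since the exact release
containing the fix is unconfirmed here):

import subprocess, sys

# Install a newer Beam SDK so job submission no longer shells out to the
# removed 'pip install --download' form.
subprocess.check_call([sys.executable, '-m', 'pip',
                       'install', 'apache-beam[gcp]==2.6.0'])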
On Fri, Aug 24, 2018 at 1:26 PM, Lukasz Cwik <lc...@google.com> wrote:
> It seems like we only mention the need for pip 7.0.0 on the python
> quickstart page https://beam.apache.org/get-started/quickstart-py/
>
> Would you like to submit a change to update it?
>