You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@beam.apache.org by Ahmet Altay <al...@google.com> on 2018/09/05 18:06:18 UTC
Re: INFO:root:Executing Error when executing a pipeline on dataflow
Ella, I believe you are using version 2.0.0. Use of --download flag is
fixed at head. (I do not recall the exact version of the fix, could be 2.2
or 2.3). If possible please try to use a newer version of Beam.
Also, as Luke suggested, we would welcome any contributions to the
documentation.
On Fri, Aug 24, 2018 at 1:26 PM, Lukasz Cwik <lc...@google.com> wrote:
> It seems like we only mention the need for pip 7.0.0 on the python
> quickstart page https://beam.apache.org/get-started/quickstart-py/
>
> Would you like to submit a change to update it?
>
> On Wed, Aug 22, 2018 at 9:31 AM OrielResearch Eila Arich-Landkof <
> eila@orielresearch.org> wrote:
>
>> The issue was with the pip version. --download was deprecated. I don't
>> know where this needs to be mentioned / fixed.
>> running
>> pip install pip==9.0.3
>>
>> solved the issue.
>>
>> Thanks,
>> eila
>>
>> On Wed, Aug 22, 2018 at 11:20 AM OrielResearch Eila Arich-Landkof <
>> eila@orielresearch.org> wrote:
>>
>>> I tried a simple pipeline which runs perfectly on the local runner, and
>>> hit the same issue on dataflow. See below. Is there anything in the environment
>>> that needs to be updated that I am not aware of?
>>>
>>> Many thanks for any reference.
>>> Eila
>>>
>>> import apache_beam as beam
>>> options = PipelineOptions()
>>> google_cloud_options = options.view_as(GoogleCloudOptions)
>>> google_cloud_options.project = 'PROJECT-ID'
>>> google_cloud_options.job_name = 'try-debug'
>>> google_cloud_options.staging_location = '%s/staging' % BUCKET_URL #'gs://archs4/staging'
>>> google_cloud_options.temp_location = '%s/tmp' % BUCKET_URL #'gs://archs4/temp'
>>> options.view_as(StandardOptions).runner = 'DataflowRunner'
>>>
>>> p1 = beam.Pipeline(options=options)
>>>
>>> (p1 | 'read' >> beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
>>> | 'write' >> beam.io.WriteToText('gs://bucket/test.txt', num_shards=1)
>>> )
>>>
>>> p1.run().wait_until_finish()
>>>
>>> will fire the following error:
>>>
>>> CalledProcessErrorTraceback (most recent call last)
>>> <ipython-input-17-b4be63f7802f> in <module>()
>>> 5 )
>>> 6
>>> ----> 7 p1.run().wait_until_finish()
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/pipeline.pyc in run(self, test_runner_api)
>>> 174 finally:
>>> 175 shutil.rmtree(tmpdir)
>>> --> 176 return self.runner.run(self)
>>> 177
>>> 178 def __enter__(self):
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.pyc in run(self, pipeline)
>>> 250 # Create the job
>>> 251 result = DataflowPipelineResult(
>>> --> 252 self.dataflow_client.create_job(self.job), self)
>>> 253
>>> 254 self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/retry.pyc in wrapper(*args, **kwargs)
>>> 166 while True:
>>> 167 try:
>>> --> 168 return fun(*args, **kwargs)
>>> 169 except Exception as exn: # pylint: disable=broad-except
>>> 170 if not retry_filter(exn):
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job(self, job)
>>> 423 def create_job(self, job):
>>> 424 """Creates job description. May stage and/or submit for remote execution."""
>>> --> 425 self.create_job_description(job)
>>> 426
>>> 427 # Stage and submit the job when necessary
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job_description(self, job)
>>> 446 """Creates a job described by the workflow proto."""
>>> 447 resources = dependency.stage_job_resources(
>>> --> 448 job.options, file_copy=self._gcs_file_copy)
>>> 449 job.proto.environment = Environment(
>>> 450 packages=resources, options=job.options,
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in stage_job_resources(options, file_copy, build_setup_args, temp_dir, populate_requirements_cache)
>>> 377 else:
>>> 378 sdk_remote_location = setup_options.sdk_location
>>> --> 379 _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>> 380 resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
>>> 381 else:
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
>>> 462 elif sdk_remote_location == 'pypi':
>>> 463 logging.info('Staging the SDK tarball from PyPI to %s', staged_path)
>>> --> 464 _dependency_file_copy(_download_pypi_sdk_package(temp_dir), staged_path)
>>> 465 else:
>>> 466 raise RuntimeError(
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _download_pypi_sdk_package(temp_dir)
>>> 525 '--no-binary', ':all:', '--no-deps']
>>> 526 logging.info('Executing command: %s', cmd_args)
>>> --> 527 processes.check_call(cmd_args)
>>> 528 zip_expected = os.path.join(
>>> 529 temp_dir, '%s-%s.zip' % (package_name, version))
>>>
>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/processes.pyc in check_call(*args, **kwargs)
>>> 42 if force_shell:
>>> 43 kwargs['shell'] = True
>>> ---> 44 return subprocess.check_call(*args, **kwargs)
>>> 45
>>> 46
>>>
>>> /usr/local/envs/py2env/lib/python2.7/subprocess.pyc in check_call(*popenargs, **kwargs)
>>> 188 if cmd is None:
>>> 189 cmd = popenargs[0]
>>> --> 190 raise CalledProcessError(retcode, cmd)
>>> 191 return 0
>>> 192
>>>
>>> CalledProcessError: Command '['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmpyyiizo', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']' returned non-zero exit status 2
>>>
>>>
>>>
>>> On Wed, Aug 22, 2018 at 10:39 AM OrielResearch Eila Arich-Landkof <
>>> eila@orielresearch.org> wrote:
>>>
>>>> Hello all,
>>>>
>>>> I am running a pipeline that used to be executed on dataflow with no
>>>> issues. I am using the datalab environment. See below for the error. To my
>>>> understanding, this is happening before the pipeline code is executed.
>>>> Any idea what went wrong?
>>>>
>>>> Thanks,
>>>> Eila
>>>>
>>>>
>>>> Executing the pipeline:
>>>>
>>>> *p.run().wait_until_finish()*
>>>>
>>>> The following error is being fired:
>>>>
>>>> INFO:root:Executing command: ['/usr/local/envs/py2env/bin/python', 'setup.py', 'sdist', '--dist-dir', '/tmp/tmp_B0gnK']
>>>> INFO:root:Starting GCS upload to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/workflow.tar.gz...
>>>> INFO:oauth2client.client:Attempting refresh to obtain initial access_token
>>>> INFO:root:Completed GCS upload to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/workflow.tar.gz
>>>> INFO:root:Staging the SDK tarball from PyPI to gs://archs4/staging/label-archs4-annotation-15.1534948236.075799/dataflow_python_sdk.tar
>>>> INFO:root:Executing command: ['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmp_B0gnK', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']
>>>>
>>>> CalledProcessErrorTraceback (most recent call last)<ipython-input-27-1e5aeb8b7d9b> in <module>()----> 1 p.run().wait_until_finish()
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/pipeline.pyc in run(self, test_runner_api) 174 finally: 175 shutil.rmtree(tmpdir)--> 176 return self.runner.run(self) 177 178 def __enter__(self):
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.pyc in run(self, pipeline) 250 # Create the job 251 result = DataflowPipelineResult(--> 252 self.dataflow_client.create_job(self.job), self) 253 254 self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/retry.pyc in wrapper(*args, **kwargs) 166 while True: 167 try:--> 168 return fun(*args, **kwargs) 169 except Exception as exn: # pylint: disable=broad-except 170 if not retry_filter(exn):
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job(self, job) 423 def create_job(self, job): 424 """Creates job description. May stage and/or submit for remote execution."""--> 425 self.create_job_description(job) 426 427 # Stage and submit the job when necessary
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.pyc in create_job_description(self, job) 446 """Creates a job described by the workflow proto.""" 447 resources = dependency.stage_job_resources(--> 448 job.options, file_copy=self._gcs_file_copy) 449 job.proto.environment = Environment( 450 packages=resources, options=job.options,
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in stage_job_resources(options, file_copy, build_setup_args, temp_dir, populate_requirements_cache) 377 else: 378 sdk_remote_location = setup_options.sdk_location--> 379 _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir) 380 resources.append(names.DATAFLOW_SDK_TARBALL_FILE) 381 else:
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir) 462 elif sdk_remote_location == 'pypi': 463 logging.info('Staging the SDK tarball from PyPI to %s', staged_path)--> 464 _dependency_file_copy(_download_pypi_sdk_package(temp_dir), staged_path) 465 else: 466 raise RuntimeError(
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/dependency.pyc in _download_pypi_sdk_package(temp_dir) 525 '--no-binary', ':all:', '--no-deps'] 526 logging.info('Executing command: %s', cmd_args)--> 527 processes.check_call(cmd_args) 528 zip_expected = os.path.join( 529 temp_dir, '%s-%s.zip' % (package_name, version))
>>>> /usr/local/envs/py2env/lib/python2.7/site-packages/apache_beam/utils/processes.pyc in check_call(*args, **kwargs) 42 if force_shell: 43 kwargs['shell'] = True---> 44 return subprocess.check_call(*args, **kwargs) 45 46
>>>> /usr/local/envs/py2env/lib/python2.7/subprocess.pyc in check_call(*popenargs, **kwargs) 188 if cmd is None: 189 cmd = popenargs[0]--> 190 raise CalledProcessError(retcode, cmd) 191 return 0 192
>>>> CalledProcessError: Command '['/usr/local/envs/py2env/bin/python', '-m', 'pip', 'install', '--download', '/tmp/tmp_B0gnK', 'google-cloud-dataflow==2.0.0', '--no-binary', ':all:', '--no-deps']' returned non-zero exit status 2
>>>>
>>>>
>>>>
>>>> --
>>>> Eila
>>>> www.orielresearch.org
>>>> https://www.meetup.com/Deep-Learning-In-Production/
>>>>
>>>>
>>>>
>>>
>>> --
>>> Eila
>>> www.orielresearch.org
>>> https://www.meetup.com/Deep-Learning-In-Production/
>>>
>>>
>>>
>>
>> --
>> Eila
>> www.orielresearch.org
>> https://www.meetup.com/Deep-Learning-In-Production/
>>
>>
>>