You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@beam.apache.org by "Beam JIRA Bot (Jira)" <ji...@apache.org> on 2021/01/04 17:13:02 UTC
[jira] [Commented] (BEAM-11098) Running Apache Beam to distribute
the cleaning of a dataset in Google Cloud Dataflow
[ https://issues.apache.org/jira/browse/BEAM-11098?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17258332#comment-17258332 ]
Beam JIRA Bot commented on BEAM-11098:
--------------------------------------
This issue was marked "stale-P2" and has not received a public comment in 14 days. It is now automatically moved to P3. If you are still affected by it, you can comment and move it back to P2.
> Running Apache Beam to distribute the cleaning of a dataset in Google Cloud Dataflow
> ------------------------------------------------------------------------------------
>
> Key: BEAM-11098
> URL: https://issues.apache.org/jira/browse/BEAM-11098
> Project: Beam
> Issue Type: Bug
> Components: sdk-py-core
> Affects Versions: 2.24.0
> Environment: Ubuntu 18.04
> Python 3.6
> Reporter: Chris Rytting
> Priority: P3
> Labels: C4, T5, apache-beam, google-cloud-dataflow, tensorflow, tensorflow-datasets
> Original Estimate: 1h
> Remaining Estimate: 1h
>
> Trying to download C4 via [these instructions](https://github.com/google-research/text-to-text-transfer-transformer#c4) and 3 hours into my job I get this error. Can't find any help on Google for it.
>
> Traceback (most recent call last):
> File "/usr/local/lib/python3.6/site-packages/dataflow_worker/batchworker.py", line 649, in do_work
> work_executor.execute()
> File "/usr/local/lib/python3.6/site-packages/dataflow_worker/executor.py", line 179, in execute
> op.start()
> File "dataflow_worker/shuffle_operations.py", line 63, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
> File "dataflow_worker/shuffle_operations.py", line 64, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
> File "dataflow_worker/shuffle_operations.py", line 79, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
> File "dataflow_worker/shuffle_operations.py", line 80, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
> File "dataflow_worker/shuffle_operations.py", line 84, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
> File "apache_beam/runners/worker/operations.py", line 332, in apache_beam.runners.worker.operations.Operation.output
> File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
> File "dataflow_worker/shuffle_operations.py", line 261, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
> File "dataflow_worker/shuffle_operations.py", line 268, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
> File "apache_beam/runners/worker/operations.py", line 332, in apache_beam.runners.worker.operations.Operation.output
> File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
> File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
> File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
> File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
> File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
> File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
> File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
> File "apache_beam/runners/common.py", line 1371, in apache_beam.runners.common._OutputProcessor.process_outputs
> File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
> File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
> File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
> File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
> File "apache_beam/runners/common.py", line 1294, in apache_beam.runners.common.DoFnRunner._reraise_augmented
> File "/usr/local/lib/python3.6/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback
> raise exc.with_traceback(traceback)
> File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
> File "apache_beam/runners/common.py", line 570, in apache_beam.runners.common.SimpleInvoker.invoke_process
> File "/mnt/pccfs/backed_up/crytting/persuasion/createc4/lib/python3.6/site-packages/apache_beam/transforms/core.py", line 815, in <lambda>
> self.process = lambda element: fn(element)
> TypeError: clean_page() got an unexpected keyword argument 'badwords_regex' [while running 'clean_pages']
--
This message was sent by Atlassian Jira
(v8.3.4#803005)