You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "off99555 (via GitHub)" <gi...@apache.org> on 2023/04/18 10:26:10 UTC
[GitHub] [arrow] off99555 commented on issue #34455: [Python] ArrowNotImplementedError: concatenation of extension<arrow.py_extension_type<...>>
off99555 commented on issue #34455:
URL: https://github.com/apache/arrow/issues/34455#issuecomment-1512832822
I also get this error when calling `Dataset.push_to_hub()` on a dataset that has more than one shard. The full traceback is:
```
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:5311, in Dataset.push_to_hub(self, repo_id, split, private, token, branch, max_shard_size, num_shards, embed_external_files)
5306 if max_shard_size is not None and num_shards is not None:
5307 raise ValueError(
5308 "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both."
5309 )
-> 5311 repo_id, split, uploaded_size, dataset_nbytes, repo_files, deleted_size = self._push_parquet_shards_to_hub(
5312 repo_id=repo_id,
5313 split=split,
5314 private=private,
5315 token=token,
5316 branch=branch,
5317 max_shard_size=max_shard_size,
5318 num_shards=num_shards,
5319 embed_external_files=embed_external_files,
5320 )
5321 organization, dataset_name = repo_id.split("/")
5322 info_to_dump = self.info.copy()
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:5194, in Dataset._push_parquet_shards_to_hub(self, repo_id, split, private, token, branch, max_shard_size, num_shards, embed_external_files)
5192 uploaded_size = 0
5193 shards_path_in_repo = []
-> 5194 for index, shard in logging.tqdm(
5195 enumerate(itertools.chain([first_shard], shards_iter)),
5196 desc="Pushing dataset shards to the dataset hub",
5197 total=num_shards,
5198 disable=not logging.is_progress_bar_enabled(),
5199 ):
5200 shard_path_in_repo = path_in_repo(index, shard)
5201 # Upload a shard only if it doesn't already exist in the repository
File /usr/local/lib/python3.10/dist-packages/tqdm/notebook.py:254, in tqdm_notebook.__iter__(self)
252 try:
253 it = super(tqdm_notebook, self).__iter__()
--> 254 for obj in it:
255 # return super(tqdm...) will not catch exception
256 yield obj
257 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
File /usr/local/lib/python3.10/dist-packages/tqdm/std.py:1178, in tqdm.__iter__(self)
1175 time = self._time
1177 try:
-> 1178 for obj in iterable:
1179 yield obj
1180 # Update and possibly print the progressbar.
1181 # Note: does not call self.update(1) for speed optimisation.
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:5169, in Dataset._push_parquet_shards_to_hub.<locals>.shards_with_embedded_external_files(shards)
5167 format = shard.format
5168 shard = shard.with_format("arrow")
-> 5169 shard = shard.map(
5170 embed_table_storage,
5171 batched=True,
5172 batch_size=1000,
5173 keep_in_memory=True,
5174 )
5175 shard = shard.with_format(**format)
5176 yield shard
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:563, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
561 self: "Dataset" = kwargs.pop("self")
562 # apply actual function
--> 563 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
564 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
565 for dataset in datasets:
566 # Remove task templates if a column mapping of the template is no longer valid
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:528, in transmit_format.<locals>.wrapper(*args, **kwargs)
521 self_format = {
522 "type": self._format_type,
523 "format_kwargs": self._format_kwargs,
524 "columns": self._format_columns,
525 "output_all_columns": self._output_all_columns,
526 }
527 # apply actual function
--> 528 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
529 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
530 # re-apply format to the output
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:3004, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
2996 if transformed_dataset is None:
2997 with logging.tqdm(
2998 disable=not logging.is_progress_bar_enabled(),
2999 unit=" examples",
(...)
3002 desc=desc or "Map",
3003 ) as pbar:
-> 3004 for rank, done, content in Dataset._map_single(**dataset_kwargs):
3005 if done:
3006 shards_done += 1
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:3395, in Dataset._map_single(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
3393 stack.enter_context(writer)
3394 if isinstance(batch, pa.Table):
-> 3395 writer.write_table(batch)
3396 else:
3397 writer.write_batch(batch)
File /usr/local/lib/python3.10/dist-packages/datasets/arrow_writer.py:567, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
565 if self.pa_writer is None:
566 self._build_writer(inferred_schema=pa_table.schema)
--> 567 pa_table = pa_table.combine_chunks()
568 pa_table = table_cast(pa_table, self._schema)
569 if self.embed_local_files:
File /usr/local/lib/python3.10/dist-packages/pyarrow/table.pxi:3315, in pyarrow.lib.Table.combine_chunks()
File /usr/local/lib/python3.10/dist-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File /usr/local/lib/python3.10/dist-packages/pyarrow/error.pxi:121, in pyarrow.lib.check_status()
ArrowNotImplementedError: concatenation of extension<arrow.py_extension_type<Array2DExtensionType>>
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org