Posted to github@arrow.apache.org by "off99555 (via GitHub)" <gi...@apache.org> on 2023/04/18 10:26:10 UTC

[GitHub] [arrow] off99555 commented on issue #34455: [Python] ArrowNotImplementedError: concatenation of extension<arrow.py_extension_type<Array2DExtensionType>>

off99555 commented on issue #34455:
URL: https://github.com/apache/arrow/issues/34455#issuecomment-1512832822

   I also get this error when calling `Dataset.push_to_hub()` on a dataset that has more than one shard.
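   
   For reference, here is a minimal sketch of the kind of call I mean. The repo id, feature name, and data are hypothetical, and the snippet only shows the shape of the call rather than a guaranteed reproduction, since whether it fails depends on the datasets/pyarrow versions and on the dataset's features:
   ```python
   from datasets import Array2D, Dataset, Features

   # Hypothetical dataset whose only feature is a 2D array; Array2D is backed
   # by datasets' Array2DExtensionType, the type named in the error below.
   features = Features({"matrix": Array2D(shape=(2, 2), dtype="float32")})
   ds = Dataset.from_dict({"matrix": [[[0.0, 1.0], [2.0, 3.0]]] * 4}, features=features)

   # Pushing with more than one shard exercises the sharded-upload code path
   # shown in the traceback below; "user/repo" is a placeholder repo id.
   ds.push_to_hub("user/repo", num_shards=2)
   ```
   The push then fails with this traceback: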
   ```
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:5311, in Dataset.push_to_hub(self, repo_id, split, private, token, branch, max_shard_size, num_shards, embed_external_files)
      5306 if max_shard_size is not None and num_shards is not None:
      5307     raise ValueError(
      5308         "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both."
      5309     )
   -> 5311 repo_id, split, uploaded_size, dataset_nbytes, repo_files, deleted_size = self._push_parquet_shards_to_hub(
      5312     repo_id=repo_id,
      5313     split=split,
      5314     private=private,
      5315     token=token,
      5316     branch=branch,
      5317     max_shard_size=max_shard_size,
      5318     num_shards=num_shards,
      5319     embed_external_files=embed_external_files,
      5320 )
      5321 organization, dataset_name = repo_id.split("/")
      5322 info_to_dump = self.info.copy()
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:5194, in Dataset._push_parquet_shards_to_hub(self, repo_id, split, private, token, branch, max_shard_size, num_shards, embed_external_files)
      5192 uploaded_size = 0
      5193 shards_path_in_repo = []
   -> 5194 for index, shard in logging.tqdm(
      5195     enumerate(itertools.chain([first_shard], shards_iter)),
      5196     desc="Pushing dataset shards to the dataset hub",
      5197     total=num_shards,
      5198     disable=not logging.is_progress_bar_enabled(),
      5199 ):
      5200     shard_path_in_repo = path_in_repo(index, shard)
      5201     # Upload a shard only if it doesn't already exist in the repository
   
   File /usr/local/lib/python3.10/dist-packages/tqdm/notebook.py:254, in tqdm_notebook.__iter__(self)
       252 try:
       253     it = super(tqdm_notebook, self).__iter__()
   --> 254     for obj in it:
       255         # return super(tqdm...) will not catch exception
       256         yield obj
       257 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
   
   File /usr/local/lib/python3.10/dist-packages/tqdm/std.py:1178, in tqdm.__iter__(self)
      1175 time = self._time
      1177 try:
   -> 1178     for obj in iterable:
      1179         yield obj
      1180         # Update and possibly print the progressbar.
      1181         # Note: does not call self.update(1) for speed optimisation.
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:5169, in Dataset._push_parquet_shards_to_hub.<locals>.shards_with_embedded_external_files(shards)
      5167 format = shard.format
      5168 shard = shard.with_format("arrow")
   -> 5169 shard = shard.map(
      5170     embed_table_storage,
      5171     batched=True,
      5172     batch_size=1000,
      5173     keep_in_memory=True,
      5174 )
      5175 shard = shard.with_format(**format)
      5176 yield shard
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:563, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
       561     self: "Dataset" = kwargs.pop("self")
       562 # apply actual function
   --> 563 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
       564 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
       565 for dataset in datasets:
       566     # Remove task templates if a column mapping of the template is no longer valid
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:528, in transmit_format.<locals>.wrapper(*args, **kwargs)
       521 self_format = {
       522     "type": self._format_type,
       523     "format_kwargs": self._format_kwargs,
       524     "columns": self._format_columns,
       525     "output_all_columns": self._output_all_columns,
       526 }
       527 # apply actual function
   --> 528 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
       529 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
       530 # re-apply format to the output
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:3004, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
      2996 if transformed_dataset is None:
      2997     with logging.tqdm(
      2998         disable=not logging.is_progress_bar_enabled(),
      2999         unit=" examples",
      (...)
      3002         desc=desc or "Map",
      3003     ) as pbar:
   -> 3004         for rank, done, content in Dataset._map_single(**dataset_kwargs):
      3005             if done:
      3006                 shards_done += 1
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py:3395, in Dataset._map_single(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
      3393     stack.enter_context(writer)
      3394 if isinstance(batch, pa.Table):
   -> 3395     writer.write_table(batch)
      3396 else:
      3397     writer.write_batch(batch)
   
   File /usr/local/lib/python3.10/dist-packages/datasets/arrow_writer.py:567, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
       565 if self.pa_writer is None:
       566     self._build_writer(inferred_schema=pa_table.schema)
   --> 567 pa_table = pa_table.combine_chunks()
       568 pa_table = table_cast(pa_table, self._schema)
       569 if self.embed_local_files:
   
   File /usr/local/lib/python3.10/dist-packages/pyarrow/table.pxi:3315, in pyarrow.lib.Table.combine_chunks()
   
   File /usr/local/lib/python3.10/dist-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
   
   File /usr/local/lib/python3.10/dist-packages/pyarrow/error.pxi:121, in pyarrow.lib.check_status()
   
   ArrowNotImplementedError: concatenation of extension<arrow.py_extension_type<Array2DExtensionType>>
   ```
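   
   The traceback bottoms out in `pa_table.combine_chunks()`, so the root cause seems to be that this pyarrow build cannot concatenate the chunks of a column backed by a `PyExtensionType`. Here is a standalone sketch of that lower-level failure, using a made-up extension type in place of `Array2DExtensionType`:
   ```python
   import pyarrow as pa

   # Made-up extension type standing in for datasets' Array2DExtensionType.
   class DummyExtensionType(pa.PyExtensionType):
       def __init__(self):
           super().__init__(pa.int64())  # storage type of the extension

       def __reduce__(self):
           # PyExtensionType subclasses must be picklable.
           return DummyExtensionType, ()

   ext_type = DummyExtensionType()
   chunk = pa.ExtensionArray.from_storage(ext_type, pa.array([1, 2, 3]))
   table = pa.table({"col": pa.chunked_array([chunk, chunk])})

   # On pyarrow versions affected by this issue, merging the two chunks raises:
   #   ArrowNotImplementedError: concatenation of extension<arrow.py_extension_type<DummyExtensionType>>
   table.combine_chunks()
   ```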

