You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/06/06 18:14:05 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #6563: Avoid per-batch field lookups in SchemaMapping

alamb commented on code in PR #6563:
URL: https://github.com/apache/arrow-datafusion/pull/6563#discussion_r1220099753


##########
datafusion/core/src/physical_plan/file_format/mod.rs:
##########
@@ -477,44 +444,32 @@ impl SchemaAdapter {
 pub struct SchemaMapping {
     /// The schema of the table. This is the expected schema after conversion and it should match the schema of the query result.
     table_schema: SchemaRef,
-    /// In `field_mappings`, a `true` value indicates that the corresponding field in `table_schema` exists in `file_schema`,
-    /// while a `false` value indicates that the corresponding field does not exist.
-    field_mappings: Vec<bool>,
+    /// Mapping from field index in `table_schema` to index in projected file_schema
+    field_mappings: Vec<Option<usize>>,
 }
 
 impl SchemaMapping {
     /// Adapts a `RecordBatch` to match the `table_schema` using the stored mapping and conversions.
     fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
         let batch_rows = batch.num_rows();
         let batch_cols = batch.columns().to_vec();
-        let batch_schema = batch.schema();
 
         let cols = self
             .table_schema
             .fields()
             .iter()
-            .enumerate()
-            .map(|(idx, field)| {
-                if self.field_mappings[idx] {
-                    match batch_schema.index_of(field.name()) {
-                        Ok(batch_idx) => arrow::compute::cast(
-                            &batch_cols[batch_idx],
-                            field.data_type(),
-                        )
-                        .map_err(DataFusionError::ArrowError),
-                        Err(_) => Ok(new_null_array(field.data_type(), batch_rows)),
-                    }
-                } else {
-                    Ok(new_null_array(field.data_type(), batch_rows))
-                }
+            .zip(&self.field_mappings)
+            .map(|(field, file_idx)| match file_idx {
+                Some(batch_idx) => cast(&batch_cols[*batch_idx], field.data_type()),
+                None => Ok(new_null_array(field.data_type(), batch_rows)),

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org