Posted to commits@arrow.apache.org by al...@apache.org on 2023/04/20 12:13:23 UTC

[arrow-datafusion] branch main updated: Clean up rustdoc and add doc lint (#6044)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new d7ca9773f6 Clean up rustdoc and add doc lint (#6044)
d7ca9773f6 is described below

commit d7ca9773f697db0f12f22c60ac7dc142fbefe5d1
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Thu Apr 20 08:13:16 2023 -0400

    Clean up rustdoc and add doc lint (#6044)
    
    * Fix rustdoc errors
    
    * Add github workflow action to check docs
    
    * improve names
    
    * fix yaml
    
    * Update user doc
    
    * clean up comments
    
    * Apply suggestions from code review
    
    Co-authored-by: Ruihang Xia <wa...@gmail.com>
    
    ---------
    
    Co-authored-by: Ruihang Xia <wa...@gmail.com>
---
 .github/workflows/rust.yml                         | 22 ++++++++++++--
 datafusion/common/src/config.rs                    |  6 ++++
 datafusion/common/src/tree_node.rs                 | 35 +++++++++++++++-------
 datafusion/common/src/utils.rs                     |  2 +-
 datafusion/core/src/lib.rs                         |  2 +-
 .../src/physical_optimizer/pipeline_checker.rs     |  3 ++
 datafusion/core/src/physical_optimizer/pruning.rs  |  2 +-
 .../core/src/physical_optimizer/sort_pushdown.rs   |  2 ++
 .../src/physical_plan/aggregates/no_grouping.rs    |  4 ++-
 .../src/physical_plan/file_format/file_stream.rs   |  6 ++--
 .../core/src/physical_plan/file_format/mod.rs      |  2 ++
 .../core/src/physical_plan/file_format/parquet.rs  |  2 +-
 .../src/physical_plan/joins/symmetric_hash_join.rs |  2 +-
 datafusion/core/src/physical_plan/joins/utils.rs   | 11 +++----
 datafusion/core/src/physical_plan/planner.rs       | 10 +++----
 .../repartition/distributor_channels.rs            |  2 +-
 datafusion/core/src/scheduler/pipeline/mod.rs      |  1 +
 datafusion/core/src/scheduler/task.rs              | 10 ++++---
 datafusion/execution/src/config.rs                 |  4 +--
 datafusion/execution/src/object_store.rs           |  3 +-
 datafusion/expr/src/tree_node/plan.rs              | 12 ++++----
 datafusion/expr/src/utils.rs                       |  4 +--
 datafusion/jit/src/api.rs                          |  6 ++--
 .../optimizer/src/analyzer/count_wildcard_rule.rs  |  3 +-
 datafusion/optimizer/src/utils.rs                  |  1 -
 datafusion/physical-expr/src/aggregate/mod.rs      |  5 ++--
 datafusion/physical-expr/src/aggregate/utils.rs    | 11 ++++---
 .../physical-expr/src/expressions/in_list.rs       |  7 +++--
 datafusion/physical-expr/src/sort_expr.rs          |  2 ++
 datafusion/sql/src/expr/arrow_cast.rs              |  5 ++--
 datafusion/sql/src/expr/mod.rs                     |  3 +-
 docs/source/user-guide/configs.md                  |  2 +-
 32 files changed, 127 insertions(+), 65 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index b28bdf3513..36a620aaa0 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -136,9 +136,9 @@ jobs:
       - name: Verify Working Directory Clean
         run: git diff --exit-code
 
-  # Run doc tests
+  # Run `cargo test doc` (test documentation examples)
   linux-test-doc:
-    name: cargo doctest (amd64)
+    name: cargo test doc (amd64)
     needs: [ linux-build-lib ]
     runs-on: ubuntu-latest
     container:
@@ -157,6 +157,24 @@ jobs:
       - name: Verify Working Directory Clean
         run: git diff --exit-code
 
+  # Run `cargo doc` to ensure the rustdoc is clean
+  linux-rustdoc:
+    name: cargo doc
+    needs: [ linux-build-lib ]
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup Rust toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Run cargo doc
+        run: |
+          export RUSTDOCFLAGS="-D warnings -A rustdoc::private-intra-doc-links"
+          cargo doc --document-private-items --no-deps --workspace
+
   # verify that the benchmark queries return the correct results
   verify-benchmark-results:
     name: verify benchmark results (amd64)
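
For context, the class of problem the new `cargo doc` job catches is an intra-doc link that does not resolve; a hypothetical snippet that would now fail CI:

```rust
/// Links like [`Vec`] resolve against items in scope and are fine.
///
/// A link like [`NoSuchType`] resolves to nothing, so `cargo doc`
/// emits an unresolved-link warning, which the `-D warnings` flag
/// above turns into a hard error.
pub fn example(v: Vec<u8>) -> Vec<u8> {
    v
}
```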
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 5973bf262e..b808f340d7 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -306,13 +306,19 @@ config_namespace! {
         /// Should DataFusion execute sorts in a per-partition fashion and merge
         /// afterwards instead of coalescing first and sorting globally.
         /// When this flag is enabled, plans in the form below
+        ///
+        /// ```text
         ///      "SortExec: [a@0 ASC]",
         ///      "  CoalescePartitionsExec",
         ///      "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
+        /// ```
         /// would turn into the plan below which performs better in multithreaded environments
+        ///
+        /// ```text
         ///      "SortPreservingMergeExec: [a@0 ASC]",
         ///      "  SortExec: [a@0 ASC]",
         ///      "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
+        /// ```
         pub repartition_sorts: bool, default = true
 
         /// When set to true, the logical plan optimizer will produce warning
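
As a usage sketch, the option documented above can be toggled through the string-keyed `SessionConfig` builder. Method names reflect the API around the time of this commit; treat them as an assumption, not a reference:

```rust
use datafusion::prelude::*;

// Build a session with per-partition sort repartitioning disabled.
fn context_without_repartitioned_sorts() -> SessionContext {
    let config = SessionConfig::new()
        .set_bool("datafusion.optimizer.repartition_sorts", false);
    SessionContext::with_config(config)
}
```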
diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs
index 34d09fdc1f..2919d9a39c 100644
--- a/datafusion/common/src/tree_node.rs
+++ b/datafusion/common/src/tree_node.rs
@@ -15,18 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! This module provides common traits for visiting or rewriting tree nodes easily.
+//! This module provides common traits for visiting or rewriting tree
+//! data structures easily.
 
 use std::sync::Arc;
 
 use crate::Result;
 
-/// Trait for tree node. It can be [`ExecutionPlan`], [`PhysicalExpr`], [`LogicalPlan`], [`Expr`], etc.
+/// Defines a visitable and rewriteable tree node. This trait is
+/// implemented for plans ([`ExecutionPlan`] and [`LogicalPlan`]) as
+/// well as expression trees ([`PhysicalExpr`], [`Expr`]) in
+/// DataFusion.
+///
+/// <!-- Since these are in the datafusion-common crate, we can't use intra-doc links -->
+/// [`ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
+/// [`PhysicalExpr`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.PhysicalExpr.html
+/// [`LogicalPlan`]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/enum.LogicalPlan.html
+/// [`Expr`]: https://docs.rs/datafusion-expr/latest/datafusion_expr/expr/enum.Expr.html
 pub trait TreeNode: Sized {
-    /// Use preorder to iterate the node on the tree so that we can stop fast for some cases.
+    /// Visits the tree nodes in pre-order so that the traversal can
+    /// stop early in some cases.
     ///
-    /// [`op`] can be used to collect some info from the tree node
-    ///      or do some checking for the tree node.
+    /// The `op` closure can be used to collect some info from the
+    /// tree node or do some checking for the tree node.
     fn apply<F>(&self, op: &mut F) -> Result<VisitRecursion>
     where
         F: FnMut(&Self) -> Result<VisitRecursion>,
@@ -68,7 +79,8 @@ pub trait TreeNode: Sized {
     /// children of that node will be visited, nor is post_visit
     /// called on that node. See [`TreeNodeVisitor`] for details.
     ///
-    /// If using the default [`post_visit`] with nothing to do, the [`apply`] should be preferred
+    /// If using the default [`TreeNodeVisitor::post_visit`] that does
+    /// nothing, [`Self::apply`] should be preferred.
     fn visit<V: TreeNodeVisitor<N = Self>>(
         &self,
         visitor: &mut V,
@@ -152,7 +164,8 @@ pub trait TreeNode: Sized {
     /// children of that node will be visited, nor is mutate
     /// called on that node
     ///
-    /// If using the default [`pre_visit`] with [`true`] returned, the [`transform`] should be preferred
+    /// If using the default [`TreeNodeRewriter::pre_visit`], which
+    /// returns `true`, [`Self::transform`] should be preferred.
     fn rewrite<R: TreeNodeRewriter<N = Self>>(self, rewriter: &mut R) -> Result<Self> {
         let need_mutate = match rewriter.pre_visit(&self)? {
             RewriteRecursion::Mutate => return rewriter.mutate(self),
@@ -190,8 +203,8 @@ pub trait TreeNode: Sized {
 /// tree and makes it easier to add new types of tree node and
 /// algorithms.
 ///
-/// When passed to[`TreeNode::visit`], [`TreeNode::pre_visit`]
-/// and [`TreeNode::post_visit`] are invoked recursively
+/// When passed to [`TreeNode::visit`], [`TreeNodeVisitor::pre_visit`]
+/// and [`TreeNodeVisitor::post_visit`] are invoked recursively
 /// on a node tree.
 ///
 /// If an [`Err`] result is returned, recursion is stopped
@@ -239,7 +252,7 @@ pub trait TreeNodeRewriter: Sized {
     fn mutate(&mut self, node: Self::N) -> Result<Self::N>;
 }
 
-/// Controls how the [TreeNode] recursion should proceed for [`rewrite`].
+/// Controls how the [`TreeNode`] recursion should proceed for [`TreeNode::rewrite`].
 #[derive(Debug)]
 pub enum RewriteRecursion {
     /// Continue rewriting this node tree.
@@ -252,7 +265,7 @@ pub enum RewriteRecursion {
     Skip,
 }
 
-/// Controls how the [TreeNode] recursion should proceed for [`visit`].
+/// Controls how the [`TreeNode`] recursion should proceed for [`TreeNode::visit`].
 #[derive(Debug)]
 pub enum VisitRecursion {
     /// Continue the visit to this node tree.
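
To make the traversal contract concrete, here is a self-contained sketch of the pre-order `apply` pattern described by this trait, using a toy node type in place of a real plan (names are illustrative, not DataFusion's):

```rust
// A minimal stand-in for a plan or expression tree node.
enum VisitRecursion {
    Continue,
    Skip,
    Stop,
}

struct Node {
    name: &'static str,
    children: Vec<Node>,
}

impl Node {
    // Pre-order: `op` sees this node before any of its children.
    // `Skip` prunes this subtree; `Stop` aborts the whole traversal.
    fn apply<F>(&self, op: &mut F) -> Result<VisitRecursion, String>
    where
        F: FnMut(&Self) -> Result<VisitRecursion, String>,
    {
        match op(self)? {
            VisitRecursion::Continue => {}
            VisitRecursion::Skip => return Ok(VisitRecursion::Continue),
            VisitRecursion::Stop => return Ok(VisitRecursion::Stop),
        }
        for child in &self.children {
            if let VisitRecursion::Stop = child.apply(op)? {
                return Ok(VisitRecursion::Stop);
            }
        }
        Ok(VisitRecursion::Continue)
    }
}

fn main() -> Result<(), String> {
    let plan = Node {
        name: "Projection",
        children: vec![Node {
            name: "Filter",
            children: vec![Node { name: "TableScan", children: vec![] }],
        }],
    };
    let mut seen = Vec::new();
    plan.apply(&mut |node| {
        seen.push(node.name);
        Ok(VisitRecursion::Continue)
    })?;
    assert_eq!(seen, ["Projection", "Filter", "TableScan"]);
    Ok(())
}
```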
diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs
index 2f468ff9c3..e29fd7e021 100644
--- a/datafusion/common/src/utils.rs
+++ b/datafusion/common/src/utils.rs
@@ -263,7 +263,7 @@ pub(crate) fn parse_identifiers(s: &str) -> Result<Vec<Ident>> {
     Ok(idents)
 }
 
-/// Construct a new Vec<ArrayRef> from the rows of the `arrays` at the `indices`.
+/// Construct a new [`Vec`] of [`ArrayRef`] from the rows of the `arrays` at the `indices`.
 pub fn get_arrayref_at_indices(
     arrays: &[ArrayRef],
     indices: &PrimitiveArray<UInt32Type>,
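
For intuition, the operation documented here is a row gather applied column by column; a plain-Rust sketch of the semantics (the real function works on Arrow `ArrayRef`s and delegates to Arrow kernels):

```rust
// Gather the rows at `indices` from each column, preserving column order.
// Columns are Vec<i32> purely for illustration.
fn rows_at_indices(columns: &[Vec<i32>], indices: &[usize]) -> Vec<Vec<i32>> {
    columns
        .iter()
        .map(|col| indices.iter().map(|&i| col[i]).collect())
        .collect()
}

fn main() {
    let columns = vec![vec![10, 11, 12, 13], vec![20, 21, 22, 23]];
    // Keep rows 3 and 1, in that order, from every column.
    assert_eq!(
        rows_at_indices(&columns, &[3, 1]),
        vec![vec![13, 11], vec![23, 21]]
    );
}
```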
diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index 94f7f9e39b..d145729cf7 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -150,7 +150,7 @@
 //! [`AggregateUDF`]: physical_plan::udaf::AggregateUDF
 //! [`QueryPlanner`]: execution::context::QueryPlanner
 //! [`OptimizerRule`]: datafusion_optimizer::optimizer::OptimizerRule
-//! [`PhysicalOptimizerRule`]: datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule
+//! [`PhysicalOptimizerRule`]: crate::physical_optimizer::optimizer::PhysicalOptimizerRule
 //!
 //! # Code Organization
 //!
diff --git a/datafusion/core/src/physical_optimizer/pipeline_checker.rs b/datafusion/core/src/physical_optimizer/pipeline_checker.rs
index 1dbc59a63a..1421e25be7 100644
--- a/datafusion/core/src/physical_optimizer/pipeline_checker.rs
+++ b/datafusion/core/src/physical_optimizer/pipeline_checker.rs
@@ -158,6 +158,9 @@ pub fn check_finiteness_requirements(
 /// data pruning. For this to be possible, it needs to have a filter where
 /// all involved [`PhysicalExpr`]s, [`Operator`]s and data types support
 /// interval calculations.
+///
+/// [`PhysicalExpr`]: crate::physical_plan::PhysicalExpr
+/// [`Operator`]: datafusion_expr::Operator
 fn is_prunable(join: &SymmetricHashJoinExec) -> bool {
     join.filter().map_or(false, |filter| {
         check_support(filter.expression())
diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index a30e0d9111..5fe3a72c69 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -736,7 +736,7 @@ fn build_is_null_column_expr(
 /// expression that will evaluate to FALSE if it can be determined no
 /// rows between the min/max values could pass the predicates.
 ///
-/// Returns the pruning predicate as an [`Expr`]
+/// Returns the pruning predicate as a [`PhysicalExpr`]
 fn build_predicate_expression(
     expr: &Arc<dyn PhysicalExpr>,
     schema: &Schema,
diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs
index 94b361cb95..f9ca976299 100644
--- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs
+++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs
@@ -37,6 +37,8 @@ use std::sync::Arc;
 /// This is a "data class" we use within the [`EnforceSorting`] rule to push
 /// down [`SortExec`] in the plan. In some cases, we can reduce the total
 /// computational cost by pushing down `SortExec`s through some executors.
+///
+/// [`EnforceSorting`]: crate::physical_optimizer::sort_enforcement::EnforceSorting
 #[derive(Debug, Clone)]
 pub(crate) struct SortPushDown {
     /// Current plan
diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs
index 101b157a36..7e4d0794b7 100644
--- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs
+++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs
@@ -47,7 +47,9 @@ pub(crate) struct AggregateStream {
 ///
 /// This is wrapped into yet another struct because we need to interact with the async memory management subsystem
 /// during poll. To have as little code "weirdness" as possible, we chose to just use [`BoxStream`] together with
-/// [`futures::stream::unfold`]. The latter requires a state object, which is [`GroupedHashAggregateStreamV2Inner`].
+/// [`futures::stream::unfold`].
+///
+/// The latter requires a state object, which is [`AggregateStreamInner`].
 struct AggregateStreamInner {
     schema: SchemaRef,
     mode: AggregateMode,
diff --git a/datafusion/core/src/physical_plan/file_format/file_stream.rs b/datafusion/core/src/physical_plan/file_format/file_stream.rs
index 06e4f22283..538798ead1 100644
--- a/datafusion/core/src/physical_plan/file_format/file_stream.rs
+++ b/datafusion/core/src/physical_plan/file_format/file_stream.rs
@@ -97,13 +97,13 @@ enum FileStreamState {
     /// Currently performing asynchronous IO to obtain a stream of RecordBatch
     /// for a given parquet file
     Open {
-        /// A [`FileOpenFuture`] returned by [`FormatReader::open`]
+        /// A [`FileOpenFuture`] returned by [`FileOpener::open`]
         future: FileOpenFuture,
         /// The partition values for this file
         partition_values: Vec<ScalarValue>,
     },
     /// Scanning the [`BoxStream`] returned by the completion of a [`FileOpenFuture`]
-    /// returned by [`FormatReader::open`]
+    /// returned by [`FileOpener::open`]
     Scan {
         /// Partitioning column values for the current batch_iter
         partition_values: Vec<ScalarValue>,
@@ -149,7 +149,7 @@ impl StartableTime {
 struct FileStreamMetrics {
     /// Wall clock time elapsed for file opening.
     ///
-    /// Time between when [`FileReader::open`] is called and when the
+    /// Time between when [`FileOpener::open`] is called and when the
     /// [`FileStream`] receives a stream for reading.
     ///
     /// If there are multiple files being scanned, the stream
diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs
index a50f3bf025..5fffc628af 100644
--- a/datafusion/core/src/physical_plan/file_format/mod.rs
+++ b/datafusion/core/src/physical_plan/file_format/mod.rs
@@ -77,6 +77,8 @@ use super::{ColumnStatistics, Statistics};
 /// different dictionary type.
 ///
 /// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same way.
+///
+/// [`ListingTable`]: crate::datasource::listing::ListingTable
 pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType {
     DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type))
 }
diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs
index c69cdb7417..6d9e35c32e 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet.rs
@@ -462,7 +462,7 @@ fn make_output_ordering_string(ordering: &[PhysicalSortExpr]) -> String {
     w
 }
 
-/// Implements [`FormatReader`] for a parquet file
+/// Implements [`FileOpener`] for a parquet file
 struct ParquetOpener {
     partition_index: usize,
     projection: Arc<[usize]>,
diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs
index 78c9aca515..0cfb79141e 100644
--- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs
+++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs
@@ -263,7 +263,7 @@ impl SymmetricHashJoinExec {
     /// # Error
     /// This function errors when:
     /// - It is not possible to join the left and right sides on keys `on`, or
-    /// - It fails to construct [SortedFilterExpr]s, or
+    /// - It fails to construct `SortedFilterExpr`s, or
     /// - It fails to create the [ExprIntervalGraph].
     pub fn try_new(
         left: Arc<dyn ExecutionPlan>,
diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs
index 9e25e03537..412e774806 100644
--- a/datafusion/core/src/physical_plan/joins/utils.rs
+++ b/datafusion/core/src/physical_plan/joins/utils.rs
@@ -735,14 +735,15 @@ pub(crate) fn need_produce_result_in_final(join_type: JoinType) -> bool {
     )
 }
 
-/// In the end of join execution, need to use bit map of the matched indices to generate the final left and
-/// right indices.
+/// At the end of join execution, we need to use the bitmap of matched
+/// indices to generate the final left and right indices.
 ///
 /// For example:
-/// left_bit_map: [true, false, true, true, false]
-/// join_type: `Left`
 ///
-/// The result is: ([1,4], [null, null])
+/// 1. left_bit_map: `[true, false, true, true, false]`
+/// 2. join_type: `Left`
+///
+/// The result is: `([1,4], [null, null])`
 pub(crate) fn get_final_indices_from_bit_map(
     left_bit_map: &BooleanBufferBuilder,
     join_type: JoinType,
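
The worked example in the doc comment above can be reproduced with a small sketch of the `Left`-join case, using a plain slice in place of the real `BooleanBufferBuilder`:

```rust
// For a Left join, rows whose bit is still false never matched, so they
// are emitted with a null on the right side.
fn final_indices(left_bit_map: &[bool]) -> (Vec<u64>, Vec<Option<u32>>) {
    let left: Vec<u64> = left_bit_map
        .iter()
        .enumerate()
        .filter(|(_, matched)| !**matched)
        .map(|(i, _)| i as u64)
        .collect();
    let right = vec![None; left.len()];
    (left, right)
}

fn main() {
    let (left, right) = final_indices(&[true, false, true, true, false]);
    assert_eq!(left, [1, 4]);
    assert_eq!(right, [None, None]);
}
```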
diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs
index 8ee32b8d04..4bad535e52 100644
--- a/datafusion/core/src/physical_plan/planner.rs
+++ b/datafusion/core/src/physical_plan/planner.rs
@@ -1292,8 +1292,8 @@ impl DefaultPhysicalPlanner {
     }
 }
 
-/// Expand and align  a GROUPING SET expression.
-/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS)
+/// Expand and align a GROUPING SET expression.
+/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>)
 ///
 /// This will take a list of grouping sets and ensure that each group is
 /// properly aligned for the physical execution plan. We do this by
@@ -1301,7 +1301,7 @@ impl DefaultPhysicalPlanner {
 /// group to the same set of expression types and ordering.
 /// For example, if we have something like `GROUPING SETS ((a,b,c),(a),(b),(b,c))`
 /// we would expand this to `GROUPING SETS ((a,b,c),(a,NULL,NULL),(NULL,b,NULL),(NULL,b,c))`
-/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS)
+/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>)
 fn merge_grouping_set_physical_expr(
     grouping_sets: &[Vec<Expr>],
     input_dfschema: &DFSchema,
@@ -1352,7 +1352,7 @@ fn merge_grouping_set_physical_expr(
 }
 
 /// Expand and align a CUBE expression. This is a special case of GROUPING SETS
-/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS)
+/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>)
 fn create_cube_physical_expr(
     exprs: &[Expr],
     input_dfschema: &DFSchema,
@@ -1399,7 +1399,7 @@ fn create_cube_physical_expr(
 }
 
 /// Expand and align a ROLLUP expression. This is a special case of GROUPING SETS
-/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS)
+/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>)
 fn create_rollup_physical_expr(
     exprs: &[Expr],
     input_dfschema: &DFSchema,
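
The NULL-padding expansion these comments describe can be sketched directly: each grouping set is aligned against the full column list (illustrative types, not the planner's):

```rust
// Align each grouping set against the full column list: a column keeps
// its name where the set contains it and becomes None (NULL) elsewhere.
fn align_grouping_sets<'a>(
    all_columns: &[&'a str],
    sets: &[Vec<&'a str>],
) -> Vec<Vec<Option<&'a str>>> {
    sets.iter()
        .map(|set| {
            all_columns
                .iter()
                .map(|col| if set.contains(col) { Some(*col) } else { None })
                .collect()
        })
        .collect()
}

fn main() {
    // GROUPING SETS ((a,b,c),(a),(b),(b,c)) from the doc comment above.
    let aligned = align_grouping_sets(
        &["a", "b", "c"],
        &[vec!["a", "b", "c"], vec!["a"], vec!["b"], vec!["b", "c"]],
    );
    assert_eq!(aligned[1], [Some("a"), None, None]);
    assert_eq!(aligned[3], [None, Some("b"), Some("c")]);
}
```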
diff --git a/datafusion/core/src/physical_plan/repartition/distributor_channels.rs b/datafusion/core/src/physical_plan/repartition/distributor_channels.rs
index 412926fbc6..d9466d647c 100644
--- a/datafusion/core/src/physical_plan/repartition/distributor_channels.rs
+++ b/datafusion/core/src/physical_plan/repartition/distributor_channels.rs
@@ -85,7 +85,7 @@ pub fn channels<T>(
 
 /// Erroring during [send](DistributionSender::send).
 ///
-/// This occurs when the [receiver](DistributedReceiver) is gone.
+/// This occurs when the [receiver](DistributionReceiver) is gone.
 #[derive(PartialEq, Eq)]
 pub struct SendError<T>(pub T);
 
diff --git a/datafusion/core/src/scheduler/pipeline/mod.rs b/datafusion/core/src/scheduler/pipeline/mod.rs
index 824a6950e0..c1838cd1a4 100644
--- a/datafusion/core/src/scheduler/pipeline/mod.rs
+++ b/datafusion/core/src/scheduler/pipeline/mod.rs
@@ -70,6 +70,7 @@ pub mod repartition;
 /// from the physical plan as a single [`Pipeline`]. Parallelized implementations
 /// are also possible
 ///
+/// [`ExecutionPlan`]: crate::physical_plan::ExecutionPlan
 pub trait Pipeline: Send + Sync + std::fmt::Debug {
     /// Push a [`RecordBatch`] to the given input partition
     fn push(&self, input: RecordBatch, child: usize, partition: usize) -> Result<()>;
diff --git a/datafusion/core/src/scheduler/task.rs b/datafusion/core/src/scheduler/task.rs
index ffa9728a4b..bf10186ef2 100644
--- a/datafusion/core/src/scheduler/task.rs
+++ b/datafusion/core/src/scheduler/task.rs
@@ -128,7 +128,8 @@ impl Task {
         }
     }
 
-    /// Call [`Pipeline::poll_partition`], attempting to make progress on query execution
+    /// Call [`Pipeline::poll_partition`][super::pipeline::Pipeline::poll_partition],
+    /// attempting to make progress on query execution
     pub fn do_work(self) {
         assert!(is_worker(), "Task::do_work called outside of worker pool");
         if self.context.is_cancelled() {
@@ -324,7 +325,9 @@ impl ExecutionContext {
 
 struct TaskWaker {
     /// Store a weak reference to the [`ExecutionContext`] to avoid reference cycles if this
-    /// [`Waker`] is stored within a [`Pipeline`] owned by the [`ExecutionContext`]
+    /// [`Waker`] is stored within a `Pipeline` owned by the [`ExecutionContext`]
+    ///
+    /// [`Waker`]: std::task::Waker
     context: Weak<ExecutionContext>,
 
     /// A counter that stores the number of times this has been awoken
@@ -338,9 +341,8 @@ struct TaskWaker {
     /// This ensures that a given [`Task`] is not enqueued multiple times
     ///
     /// We store an integer, as opposed to a boolean, so that wake ups that
-    /// occur during [`Pipeline::poll_partition`] can be detected and handled
+    /// occur during `Pipeline::poll_partition` can be detected and handled
     /// after it has finished executing
-    ///
     wake_count: AtomicUsize,
 
     /// The index of the pipeline within `query` to poll
diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs
index 8f0094fadf..81b7e8f0a6 100644
--- a/datafusion/execution/src/config.rs
+++ b/datafusion/execution/src/config.rs
@@ -94,7 +94,7 @@ impl SessionConfig {
 
     /// Customize [`target_partitions`]
     ///
-    /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions
+    /// [`target_partitions`]: datafusion_common::config::ExecutionOptions::target_partitions
     pub fn with_target_partitions(mut self, n: usize) -> Self {
         // partition count must be greater than zero
         assert!(n > 0);
@@ -104,7 +104,7 @@ impl SessionConfig {
 
     /// Get [`target_partitions`]
     ///
-    /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions
+    /// [`target_partitions`]: datafusion_common::config::ExecutionOptions::target_partitions
     pub fn target_partitions(&self) -> usize {
         self.options.execution.target_partitions
     }
diff --git a/datafusion/execution/src/object_store.rs b/datafusion/execution/src/object_store.rs
index c22ec34a60..d5b7e48d6f 100644
--- a/datafusion/execution/src/object_store.rs
+++ b/datafusion/execution/src/object_store.rs
@@ -117,7 +117,8 @@ impl std::fmt::Display for ObjectStoreUrl {
 /// 2. Systems relying on ad-hoc discovery, without corresponding DDL, can create [`ObjectStore`]
 /// lazily by providing a custom implementation of [`ObjectStoreRegistry`]
 ///
-/// [`ListingTableUrl`]: crate::datasource::listing::ListingTableUrl
+/// <!-- is in a different crate so normal rustdoc links don't work -->
+/// [`ListingTableUrl`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTableUrl.html
 pub trait ObjectStoreRegistry: Send + Sync + std::fmt::Debug + 'static {
     /// If a store with the same key existed before, it is replaced and returned
     fn register_store(
diff --git a/datafusion/expr/src/tree_node/plan.rs b/datafusion/expr/src/tree_node/plan.rs
index e948f90da8..c7621bc178 100644
--- a/datafusion/expr/src/tree_node/plan.rs
+++ b/datafusion/expr/src/tree_node/plan.rs
@@ -22,12 +22,14 @@ use datafusion_common::tree_node::{TreeNodeVisitor, VisitRecursion};
 use datafusion_common::{tree_node::TreeNode, Result};
 
 impl TreeNode for LogicalPlan {
-    /// Compared to the default implementation, we need to invoke [`apply_subqueries`]
-    /// before visiting its children
     fn apply<F>(&self, op: &mut F) -> Result<VisitRecursion>
     where
         F: FnMut(&Self) -> Result<VisitRecursion>,
     {
+        // Note:
+        //
+        // Compared to the default implementation, we need to invoke
+        // [`Self::apply_subqueries`] before visiting its children
         match op(self)? {
             VisitRecursion::Continue => {}
             // If the recursion should skip, do not apply to its children, and let the recursion continue
@@ -61,13 +63,13 @@ impl TreeNode for LogicalPlan {
     /// visitor.post_visit(Filter)
     /// visitor.post_visit(Projection)
     /// ```
-    ///
-    /// Compared to the default implementation, we need to invoke [`visit_subqueries`]
-    /// before visiting its children
     fn visit<V: TreeNodeVisitor<N = Self>>(
         &self,
         visitor: &mut V,
     ) -> Result<VisitRecursion> {
+        // Compared to the default implementation, we need to invoke
+        // [`Self::visit_subqueries`] before visiting its children
+
         match visitor.pre_visit(self)? {
             VisitRecursion::Continue => {}
             // If the recursion should skip, do not apply to its children, and let the recursion continue
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index a61e79fc24..3aec826d81 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -69,7 +69,7 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
     }
 }
 
-/// The power set (or powerset) of a set S is the set of all subsets of S, \
+/// The [power set] (or powerset) of a set S is the set of all subsets of S, \
 /// including the empty set and S itself.
 ///
 /// Example:
@@ -85,7 +85,7 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
 ///  {x, y, z} \
 ///  and hence the power set of S is {{}, {x}, {y}, {z}, {x, y}, {x, z}, {y, z}, {x, y, z}}.
 ///
-/// Reference: https://en.wikipedia.org/wiki/Power_set
+/// [power set]: https://en.wikipedia.org/wiki/Power_set
 fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>, String> {
     if slice.len() >= 64 {
         return Err("The size of the set must be less than 64.".into());
diff --git a/datafusion/jit/src/api.rs b/datafusion/jit/src/api.rs
index a3989b5e5a..4ac26438bc 100644
--- a/datafusion/jit/src/api.rs
+++ b/datafusion/jit/src/api.rs
@@ -274,7 +274,7 @@ impl<'a> CodeBlock<'a> {
         unreachable!()
     }
 
-    /// Enter else block. Try [if_block] first which is much easier to use.
+    /// Enter else block. Try [`Self::if_block`] first, which is much easier to use.
     fn enter_else(&mut self) {
         self.fields.pop_back();
         self.fields.push_back(HashMap::new());
@@ -367,7 +367,7 @@ impl<'a> CodeBlock<'a> {
         Ok(())
     }
 
-    /// Enter `while` loop block. Try [while_block] first which is much easier to use.
+    /// Enter `while` loop block. Try [`Self::while_block`] first, which is much easier to use.
     fn while_loop(&mut self, cond: Expr) -> Result<CodeBlock> {
         if cond.get_type() != BOOL {
             internal_err!("while condition must be bool")
@@ -384,7 +384,7 @@ impl<'a> CodeBlock<'a> {
         }
     }
 
-    /// Enter `if-then-else`'s then block. Try [if_block] first which is much easier to use.
+    /// Enter `if-then-else`'s then block. Try [`Self::if_block`] first, which is much easier to use.
     fn if_else(&mut self, cond: Expr) -> Result<CodeBlock> {
         if cond.get_type() != BOOL {
             internal_err!("if condition must be bool")
diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs
index ed48da7fde..4520141de5 100644
--- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs
+++ b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs
@@ -33,7 +33,8 @@ use crate::analyzer::AnalyzerRule;
 pub const COUNT_STAR: &str = "COUNT(*)";
 
 /// Rewrite `Count(Expr:Wildcard)` to `Count(Expr:Literal)`.
-/// Resolve issue: https://github.com/apache/arrow-datafusion/issues/5473.
+///
+/// Resolves issue: <https://github.com/apache/arrow-datafusion/issues/5473>
 #[derive(Default)]
 pub struct CountWildcardRule {}
 
diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs
index b52f1b1146..4e5fa8cd6a 100644
--- a/datafusion/optimizer/src/utils.rs
+++ b/datafusion/optimizer/src/utils.rs
@@ -487,7 +487,6 @@ pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema {
 /// from both the subquery and outer table or only from the outer table.
 ///
 /// Returns the join predicates and the extracted subquery.
-/// ```
 pub(crate) fn extract_join_filters(
     maybe_filter: &LogicalPlan,
 ) -> Result<(Vec<Expr>, LogicalPlan)> {
diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs
index 691b708162..b3e37a8f92 100644
--- a/datafusion/physical-expr/src/aggregate/mod.rs
+++ b/datafusion/physical-expr/src/aggregate/mod.rs
@@ -57,8 +57,9 @@ pub(crate) mod variance;
 /// * knows its accumulator's state's field
 /// * knows the expressions from which its accumulator will receive values
 ///
-/// The aggregate expressions implement this trait also need to implement the [PartialEq<dyn Any>]
-/// This allows comparing the equality between the trait objects
+/// Any implementation of this trait also needs to implement
+/// `PartialEq<dyn Any>` to allow comparing equality between
+/// trait objects.
 pub trait AggregateExpr: Send + Sync + Debug + PartialEq<dyn Any> {
     /// Returns the aggregate expression as [`Any`](std::any::Any) so that it can be
     /// downcast to a specific implementation.
diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs
index feabbd9cdc..158ceb316e 100644
--- a/datafusion/physical-expr/src/aggregate/utils.rs
+++ b/datafusion/physical-expr/src/aggregate/utils.rs
@@ -83,10 +83,13 @@ pub fn calculate_result_decimal_for_avg(
     }
 }
 
-/// Downcast the [`Box<dyn AggregateExpr>`] or [`Arc<dyn AggregateExpr>`] and return
-/// the inner trait object as [`Any`](std::any::Any) so that it can be downcast to a specific implementation.
-/// This method is used when implementing the [`PartialEq<dyn Any>'] for aggregation expressions and allows
-/// comparing the equality between the trait objects.
+/// Downcast a `Box<dyn AggregateExpr>` or `Arc<dyn AggregateExpr>`
+/// and return the inner trait object as [`Any`](std::any::Any) so
+/// that it can be downcast to a specific implementation.
+///
+/// This method is used when implementing `PartialEq<dyn Any>`
+/// for [`AggregateExpr`] aggregation expressions and allows comparing
+/// equality between the trait objects.
 pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
     if any.is::<Arc<dyn AggregateExpr>>() {
         any.downcast_ref::<Arc<dyn AggregateExpr>>()
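
The unwrap step in `down_cast_any_ref` generalizes to any `&dyn Any` that actually holds an `Arc<dyn Trait>`; a standalone sketch with a stand-in trait:

```rust
use std::any::Any;
use std::sync::Arc;

// Stand-in for AggregateExpr; only the as_any accessor matters here.
trait Expr: Any {
    fn as_any(&self) -> &dyn Any;
}

// Unwrap an Any holding Arc<dyn Expr> so callers can downcast to the
// concrete type behind the trait object.
fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
    if let Some(arc) = any.downcast_ref::<Arc<dyn Expr>>() {
        arc.as_any()
    } else {
        any
    }
}

struct Count;
impl Expr for Count {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

fn main() {
    let expr: Arc<dyn Expr> = Arc::new(Count);
    // Without the unwrap step, downcasting would fail: the Any would be
    // the Arc itself, not the Count behind it.
    assert!(down_cast_any_ref(&expr).downcast_ref::<Count>().is_some());
}
```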
diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs
index c4d5bb9b5f..575050cfff 100644
--- a/datafusion/physical-expr/src/expressions/in_list.rs
+++ b/datafusion/physical-expr/src/expressions/in_list.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! InList expression
+//! Implementation of `InList` expressions: [`InListExpr`]
 
 use ahash::RandomState;
 use std::any::Any;
@@ -134,8 +134,9 @@ where
     }
 }
 
-/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there are nulls present
-/// or there are more than [`OPTIMIZER_INSET_THRESHOLD`] values
+/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there
+/// are nulls present or there are more than the configured number of
+/// elements.
 ///
 /// Note: This is split into a separate function as higher-rank trait bounds currently
 /// cause type inference to misbehave
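
The trade-off described here (hash set for long lists or lists with nulls, linear scan otherwise) can be sketched with an assumed threshold; the real code picks its own cutoff and builds the set once per expression, not per lookup:

```rust
use std::collections::HashSet;

// Assumed cutoff for illustration only.
const IN_SET_THRESHOLD: usize = 30;

enum InListCheck {
    // Short lists: a linear scan per value is cheapest.
    Scan(Vec<i64>),
    // Long lists: pay once to build a hash set, then O(1) lookups.
    Set(HashSet<i64>),
}

impl InListCheck {
    fn new(list: Vec<i64>) -> Self {
        if list.len() > IN_SET_THRESHOLD {
            InListCheck::Set(list.into_iter().collect())
        } else {
            InListCheck::Scan(list)
        }
    }

    fn contains(&self, v: i64) -> bool {
        match self {
            InListCheck::Scan(list) => list.contains(&v),
            InListCheck::Set(set) => set.contains(&v),
        }
    }
}

fn main() {
    let check = InListCheck::new((0i64..100).collect());
    assert!(check.contains(42));
    assert!(!check.contains(1000));
}
```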
diff --git a/datafusion/physical-expr/src/sort_expr.rs b/datafusion/physical-expr/src/sort_expr.rs
index 1699d25bf2..f19ae67004 100644
--- a/datafusion/physical-expr/src/sort_expr.rs
+++ b/datafusion/physical-expr/src/sort_expr.rs
@@ -180,6 +180,8 @@ impl PhysicalSortRequirement {
     ///
     /// This method takes `&'a PhysicalSortExpr` to make it easy to
     /// use when implementing [`ExecutionPlan::required_input_ordering`].
+    ///
+    /// [`ExecutionPlan::required_input_ordering`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#method.required_input_ordering
     pub fn from_sort_exprs<'a>(
         ordering: impl IntoIterator<Item = &'a PhysicalSortExpr>,
     ) -> Vec<PhysicalSortRequirement> {
diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs
index 043d078d8d..607a9fb432 100644
--- a/datafusion/sql/src/expr/arrow_cast.rs
+++ b/datafusion/sql/src/expr/arrow_cast.rs
@@ -29,8 +29,8 @@ pub const ARROW_CAST_NAME: &str = "arrow_cast";
 
 /// Create an [`Expr`] that evaluates the `arrow_cast` function
 ///
-/// This function is not a [`BuiltInScalarFunction`] because the
-/// return type of [`BuiltInScalarFunction`] depends only on the
+/// This function is not a [`BuiltinScalarFunction`] because the
+/// return type of [`BuiltinScalarFunction`] depends only on the
 /// *types* of the arguments. However, the return type of `arrow_cast` depends on
 /// the *value* of its second argument.
 ///
@@ -48,6 +48,7 @@ pub const ARROW_CAST_NAME: &str = "arrow_cast";
 /// ```sql
 /// select arrow_cast(column_x, 'Float64')
 /// ```
+/// [`BuiltinScalarFunction`]: datafusion_expr::BuiltinScalarFunction
 pub fn create_arrow_cast(mut args: Vec<Expr>, schema: &DFSchema) -> Result<Expr> {
     if args.len() != 2 {
         return Err(DataFusionError::Plan(format!(
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index 4a8719331a..67fe8b1e99 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -518,7 +518,8 @@ fn rewrite_placeholder(expr: &mut Expr, other: &Expr, schema: &DFSchema) -> Resu
     Ok(())
 }
 
-/// Find all [`Expr::PlaceHolder`] tokens in a logical plan, and try to infer their type from context
+/// Find all [`Expr::Placeholder`] tokens in a logical plan, and try
+/// to infer their [`DataType`] from the context of their use.
 fn infer_placeholder_types(expr: Expr, schema: &DFSchema) -> Result<Expr> {
     expr.transform(&|mut expr| {
         // Default to assuming the arguments are the same type
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index dc21c81942..4b754bcd8a 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -63,7 +63,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.optimizer.allow_symmetric_joins_without_pruning | true       | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in  [...]
 | datafusion.optimizer.repartition_file_scans                | true       | When set to true, file groups will be repartitioned to achieve maximum parallelism. Currently supported only for Parquet format in which case multiple row groups from the same file may be read concurrently. If false then each row group is read serially, though different files may be read in parallel.                                                                                                                    [...]
 | datafusion.optimizer.repartition_windows                   | true       | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                [...]
-| datafusion.optimizer.repartition_sorts                     | true       | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", would turn into the plan below which performs better in multithreaded environments "SortPreservingMergeExec: [a@ [...]
+| datafusion.optimizer.repartition_sorts                     | true       | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. When this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreserving [...]
 | datafusion.optimizer.skip_failed_rules                     | true       | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail                                                                                                                                                                                             [...]
 | datafusion.optimizer.max_passes                            | 3          | Number of times that the optimizer will attempt to optimize the plan                                                                                                                                                                                                                                                                                                                                                             [...]
 | datafusion.optimizer.top_down_join_key_reordering          | true       | When set to true, the physical plan optimizer will run a top down process to reorder the join keys                                                                                                                                                                                                                                                                                                                               [...]