You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "Scott Wilson (Jira)" <ji...@apache.org> on 2020/04/17 19:38:00 UTC

[jira] [Commented] (ARROW-602) [C++] Provide iterator access to primitive elements inside a Column/ChunkedArray

    [ https://issues.apache.org/jira/browse/ARROW-602?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17086017#comment-17086017 ] 

Scott Wilson commented on ARROW-602:
------------------------------------

[~wesm] I've taken a stab at implementing ChunkedArray covers and iterators to support STL interfaces. My initial implementation returned std::optional<T>, but I didn't like the way this interacted with most stl numeric algorithms which expect iterator dereferences to be double or the like, and std::optional<double> doesn't automatically cast to double. I added a IsNull() member to the iterators, so at least this info is accessible.

I'm not quite sure how to handle the issue of mutability, so I've simply punted. (Is there some description of the expected semantics of immutability? Is it really the case that I shouldn't write into a raw_value if the array is immutable?) Arrays that return proxys for the underlying memory, e.g. string, require a strange _cached variable in order to return a dereferenced iterator.

This my first pass at this code. I've done little testing, and it is not comprehensive. I just wanted to verify that I'm not doing something abnormally stupid ;)
{code:c++}
#include <cstdint>
#include <memory>
#include <numeric>
#include <string>
#include <iostream>
#include <optional>
#include <vector>
#include <xutility>

#include <arrow/api.h>
#include <arrow/filesystem/localfs.h>
#include <arrow/csv/api.h>
#include <arrow/result.h>
#include <arrow/builder.h>

#include <boost/iterator/iterator_facade.hpp>
#include <boost/range/iterator_range.hpp>

using namespace std;
using namespace arrow;

// SBW 2020.04.15 For ArrayCoverRaw::iterator, we can simply use the the pointer interface.
// Wes suggests returning std::optional<T>, but sizeof(double) < sizeof(std::optional<double>) and
// is not a drop-in replacement for T, i.e. optional<T> can't be used in expression, need optional<T>.value().

// STL container-like cover for arrow::Array.
// Only works for Array types that support raw_values().
template<typename ArrType>
class ArrayCoverRaw
{
public:
	using T = typename ArrType::value_type;
	using pointer = T*;
	using const_pointer = const T*;
	using reference = T&;
	using const_reference = const T&;
	// Match size_type to Array offsets rather than using size_t and ptrdiff_t.
	using size_type = int64_t;
	using difference_type = int64_t;
	using iterator = pointer;
	using const_iterator = const_pointer;
	using reverse_iterator = pointer;
	using const_reverse_iterator = const_pointer;

	ArrayCoverRaw(std::shared_ptr<ArrType>& array) : _array(array) {}

	size_type size() const { return _array->length(); }

	// Should non-const versions fail if Array is immutable?
	iterator begin() { return const_cast<pointer>(_array->raw_values()); }
	iterator end() { return const_cast<pointer>(_array->raw_values()+_array->length()); }
	reverse_iterator rbegin() { return const_cast<pointer>(_array->raw_values()+_array->length()-1); }
	reverse_iterator rend() { return const_cast<pointer>(_array->raw_values()-1); }
	const_iterator cbegin() const { return _array->raw_values(); }
	const_iterator cend() const { return _array->raw_values()+_array->length(); }
	const_reverse_iterator crbegin() const { return _array->raw_values()+_array->length()-1; }
	const_reverse_iterator crend() const { return _array->raw_values()-1; }

	// We could return std::optional<T> to encapsulate IsNull() info, but this would seem to break the expected semantics.
	reference operator[](const difference_type off) { assert(_array->data()->is_mutable()); return _array->raw_values()+off; }
	const_reference operator[](const difference_type off) const { return _array->raw_values()+off; }
	// ISSUE: is there an interface for setting IsNull() if array is mutable.
	bool IsNull(difference_type off) const { return _array->IsNull(off); }

protected:
	std::shared_ptr<ArrType> _array;
};

// TODO: Add ArrayCoverString and iterators, perhaps others.

// Use template on Value so we can create iterator and const_iterator by changing Value.
// Only works for Array types that support raw_values().
template <typename ArrType, class Value>
class ChunkedArrayIteratorRaw
	: public boost::iterator_facade<ChunkedArrayIteratorRaw<ArrType, Value>, Value, boost::random_access_traversal_tag>
{
public:
	using difference_type = int64_t;
	using T = typename ArrType::value_type;
	using pointer = T*;

	explicit ChunkedArrayIteratorRaw(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, difference_type pos = 0)
		: _ch_arr(ch_arr)
	{
		set_position(pos);
	}

	bool IsNull() const
	{
		auto arr = _ch_arr->chunk(_chunk_index);
		return arr->IsNull(_current-_first);
	}

private:
	friend class boost::iterator_core_access;

    bool equal(ChunkedArrayIteratorRaw<ArrType, Value> const& other) const
    {
        return this->_position == other._position;
    }

    void increment()
	{
		_position++;
		// Need to move to next chunk?
		if ((_current == _last) && ((_chunk_index+1) < _ch_arr->num_chunks()))
		{
			_chunk_index++;
			auto arr = _ch_arr->chunk(_chunk_index);
			auto typed_arr = std::static_pointer_cast<ArrType>(arr);
			_first = const_cast<pointer>(typed_arr->raw_values());
			_last = _first + arr->length() - 1;
			_current = _first;
		}
		else
		{
			_current++;
		}
	}

	void decrement()
	{
		_position--;
		// Need to move to previous chunk?
		if ((_current == _first) && (_chunk_index > 0))
		{
			_chunk_index--;
			auto arr = _ch_arr->chunk(_chunk_index);
			auto typed_arr = std::static_pointer_cast<ArrType>(arr);
			_first = const_cast<pointer>(typed_arr->raw_values());
			_last = _first + arr->length() - 1;
			_current = _last;
		}
		else
		{
			_current--;
		}
	}

    Value& dereference() const { return *_current; }

	void advance(difference_type n)
	{
		_position += n;
		while (n > 0)
		{
			difference_type max_delta = _last - _current;
			if ((max_delta >= n) || ((_chunk_index+1) == _ch_arr->num_chunks()))
			{
				_current += n;
				return;
			}
			// Move to next chunk.
			n -= max_delta;
			_chunk_index++;
			auto arr = _ch_arr->chunk(_chunk_index);
			auto typed_arr = std::static_pointer_cast<ArrType>(arr);
			_first = const_cast<pointer>(typed_arr->raw_values());
			_last = _first + arr->length() - 1;
			_current = _first;
		}
		while (n < 0)
		{
			difference_type max_delta = _first - _current;
			if ((max_delta <= n) || (_chunk_index == 0))
			{
				_current += n;
				return;
			}
			// Move to previous chunk.
			n -= max_delta;
			_chunk_index--;
			assert(_chunk_index >= 0);
			auto arr = _ch_arr->chunk(_chunk_index);
			auto typed_arr = std::static_pointer_cast<ArrType>(arr);
			_first = const_cast<pointer>(typed_arr->raw_values());
			_last = _first + arr->length() - 1;
			_current = _last;
		}
	}

	difference_type distance_to(ChunkedArrayIteratorRaw<ArrType, Value> const& other)
	{
		return other._position - this->_position;
	}

	// Helper
	void set_position(difference_type pos)
	{
		_position = pos;
		const int nchunks = _ch_arr->num_chunks();
		int64_t offset = 0;
		for (_chunk_index = 0; _chunk_index < nchunks; _chunk_index++)
		{
			auto arr = _ch_arr->chunk(_chunk_index);
			int64_t arr_rows = arr->length();
			if (((offset+arr_rows) > pos) || ((_chunk_index+1)==nchunks))
			{
				auto typed_arr = std::static_pointer_cast<ArrType>(arr);
				_first = const_cast<T*>(typed_arr->raw_values());
				_last = _first + arr_rows - 1;
				_current = _first + (pos-offset);
				return;
			}
			offset += arr_rows;
		}
		assert(false);
	}

	std::shared_ptr<arrow::ChunkedArray> _ch_arr;
	// Which Array we're looking at.
	int _chunk_index = 0;
	// Pointers into current Array. Use first/last rather than begin/end for symmetry of moving forward/backward.
	pointer _first = 0;
	pointer _current = 0;
	pointer _last = 0;
	// Cache position across all chunks for support of random access.
	difference_type _position = 0;
};

// This implementation is a subclass for Arrays that use GetView(i), GetString(i), etc.
// Concrete subclass only needs to implement dereference(i).
template <typename ArrType>
class ChunkedArrayIteratorIndexImpl
{
public:
	using difference_type = int64_t;

	explicit ChunkedArrayIteratorIndexImpl(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, difference_type pos = 0)
		: _ch_arr(ch_arr)
	{
		set_position(pos);
	}

	bool IsNull() const
	{
		auto arr = _ch_arr->chunk(_chunk_index);
		return arr->IsNull(_current);
	}

protected:
	friend class boost::iterator_core_access;

    bool equal(ChunkedArrayIteratorIndexImpl<ArrType> const& other) const
    {
        return this->_position == other._position;
    }

    void increment()
	{
		_position++;
		// Need to move to next chunk?
		if ((_current == _last) && ((_chunk_index+1) < _ch_arr->num_chunks()))
		{
			_chunk_index++;
			auto arr = _ch_arr->chunk(_chunk_index);
			_typed_arr = std::static_pointer_cast<ArrType>(arr);
			_last = arr->length() - 1;
			_current = 0;
		}
		else
		{
			_current++;
		}
	}

	void decrement()
	{
		_position--;
		// Need to move to previous chunk?
		if ((_current == _first) && (_chunk_index > 0))
		{
			_chunk_index--;
			auto arr = _ch_arr->chunk(_chunk_index);
			_typed_arr = std::static_pointer_cast<ArrType>(arr);
			_last = arr->length() - 1;
			_current = _last;
		}
		else
		{
			_current--;
		}
	}

    // Value& dereference() const { return *_current; }

	void advance(difference_type n)
	{
		_position += n;
		while (n > 0)
		{
			difference_type max_delta = _last - _current;
			if ((max_delta >= n) || ((_chunk_index+1) == _ch_arr->num_chunks()))
			{
				_current += n;
				return;
			}
			// Move to next chunk.
			n -= max_delta;
			_chunk_index++;
			auto arr = _ch_arr->chunk(_chunk_index);
			_typed_arr = std::static_pointer_cast<ArrType>(arr);
			_last = arr->length() - 1;
			_current = 0;
		}
		while (n < 0)
		{
			difference_type max_delta = 0 - _current;
			if ((max_delta <= n) || (_chunk_index == 0))
			{
				_current += n;
				return;
			}
			// Move to previous chunk.
			n -= max_delta;
			_chunk_index--;
			auto arr = _ch_arr->chunk(_chunk_index);
			_typed_arr = std::static_pointer_cast<ArrType>(arr);
			_last = arr->length() - 1;
			_current = _last;
		}
	}

	difference_type distance_to(ChunkedArrayIteratorIndexImpl<ArrType> const& other)
	{
		return other._position - this->_position;
	}

	// Helper
	void set_position(difference_type pos)
	{
		_position = pos;
		const int nchunks = _ch_arr->num_chunks();
		int64_t offset = 0;
		for (_chunk_index = 0; _chunk_index < nchunks; _chunk_index++)
		{
			auto arr = _ch_arr->chunk(_chunk_index);
			int64_t arr_rows = arr->length();
			if (((offset+arr_rows) > pos) || ((_chunk_index+1)==nchunks))
			{
				_typed_arr = std::static_pointer_cast<ArrType>(arr);
				_last = arr_rows - 1;
				_current = (pos-offset);
				return;
			}
			offset += arr_rows;
		}
		assert(false);
	}

	std::shared_ptr<arrow::ChunkedArray> _ch_arr;
	// Which Array we're looking at.
	int _chunk_index = 0;
	// Current Array. Use first/last rather than begin/end for symmetry of moving forward/backward.
	std::shared_ptr<ArrType> _typed_arr;
	difference_type _current = 0;
	difference_type _last = 0;
	// Cache position across all chunks for support of random access.
	difference_type _position = 0;
};

class ChunkedArrayIteratorString :
	public ChunkedArrayIteratorIndexImpl<StringArray>,
	public boost::iterator_facade<ChunkedArrayIteratorString, const std::string, boost::random_access_traversal_tag>
{
public:
	using difference_type = int64_t;
	using Value = const std::string;

	explicit ChunkedArrayIteratorString(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, difference_type pos = 0)
		: ChunkedArrayIteratorIndexImpl(ch_arr, pos)
	{
	}

	// Cache value to avoid returning pointer to temp.
    Value& dereference() const { _cached = _typed_arr->GetString(_current); return _cached; }

private:
	mutable std::string _cached;
};

class ChunkedArrayIteratorBoolean :
	public ChunkedArrayIteratorIndexImpl<BooleanArray>,
	public boost::iterator_facade<ChunkedArrayIteratorBoolean, const bool, boost::random_access_traversal_tag>
{
public:
	using difference_type = int64_t;
	using Value = const bool;

	explicit ChunkedArrayIteratorBoolean(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, difference_type pos = 0)
		: ChunkedArrayIteratorIndexImpl(ch_arr, pos)
	{
	}

	// Cache value to avoid returning pointer to temp.
    Value& dereference() const { _cached = _typed_arr->GetView(_current); return _cached; }

private:
	mutable bool _cached;
};

// STL container-like cover for arrow::ChunkedArray.
// Only works for ChunkedArrays composed of Array types that support raw_values().
template<typename ArrType>
class ChunkedArrayCoverRaw
{
public:
	using T = typename ArrType::value_type;
	// Match size_type to Array offsets rather than using size_t and ptrdiff_t.
	using size_type = int64_t;
	using difference_type = int64_t;
	using iterator = typename ChunkedArrayIteratorRaw<ArrType, T>;
	using const_iterator = typename ChunkedArrayIteratorRaw<ArrType, const T>;;
	using reverse_iterator = iterator;
	using const_reverse_iterator = const_iterator;

	ChunkedArrayCoverRaw(std::shared_ptr<ChunkedArray>& array) : _array(array) {}

	size_type size() const { return _array->length(); }

	// Should non-const versions fail if Array is immutable?
	iterator begin() { return iterator(_array); }
	iterator end() { return iterator(_array, size()); }
	reverse_iterator rbegin() { return iterator(_array, size()-1); }
	reverse_iterator rend() { return iterator(_array, -1); }
	const_iterator cbegin() const { return const_iterator(_array); }
	const_iterator cend() const { return const_iterator(_array, size()); }
	const_reverse_iterator crbegin() const { return const_iterator(_array, size()-1); }
	const_reverse_iterator crend() const { return const_iterator(_array, -1); }

protected:
	std::shared_ptr<ChunkedArray> _array;
};


int main(int argc, char *argv[])
{
	auto fs = make_shared<fs::LocalFileSystem>();
	auto r_input = fs->OpenInputStream("c:/temp/_DatasetP14Seizures.csv");

	auto pool = default_memory_pool();
	auto read_options = arrow::csv::ReadOptions::Defaults();
	auto parse_options = arrow::csv::ParseOptions::Defaults();
	auto convert_options = arrow::csv::ConvertOptions::Defaults();

	auto r_table_reader = csv::TableReader::Make(pool, r_input.ValueOrDie(),
		read_options, parse_options, convert_options);
	auto r_read = r_table_reader.ValueOrDie()->Read();
	auto pTable = r_read.ValueOrDie();

	PrettyPrintOptions options{0};
	arrow::PrettyPrint(*pTable, options, &std::cout);

	// Test covers and iterators.
	const Table& tlb = *pTable;
	const int64_t rows = tlb.num_rows();
	const int cols = tlb.num_columns();
	for (int c = 0; c < cols; c++)
	{
		auto f = tlb.field(c);
		const string& name = f->name();
		int type_id = f->type()->id();
		auto ch_arr = tlb.column(c);
		switch (type_id)
		{
		case Type::DOUBLE:
			{
#if 0
				using iterator = ChunkedArrayIteratorRaw<arrow::DoubleArray, double>;
				iterator it(ch_arr, 2);
				cout << it.IsNull() << endl;
				boost::iterator_range<iterator> range(it-2, it+8);
				for (double val : range)
				 	cout << val << endl;
#else
				using cover = ChunkedArrayCoverRaw<arrow::DoubleArray>;
				using iterator = typename cover::iterator;
				using range = typename boost::iterator_range<iterator>;
				cover cvr(ch_arr);
				auto begin = cvr.begin();
				auto end = cvr.end();
				auto rbegin = cvr.rbegin();
				auto rend = cvr.rend();
				auto it = begin;
				it += 2;
				cout << it.IsNull() << endl;
				range rng(it-2, it+8);
				for (double val : rng)
				 	cout << val << endl;
#endif
			}
			break;
		case Type::STRING:
			{
				using iterator = ChunkedArrayIteratorString;
				iterator it(ch_arr, 2);
				cout << it.IsNull() << endl;
				boost::iterator_range<iterator> range(it-2, it+8);
				for (std::string val : range)
				 	cout << val << endl;
			}
			break;
		case Type::BOOL:
			{
				using iterator = ChunkedArrayIteratorBoolean;
				iterator it(ch_arr, 2);
				cout << it.IsNull() << endl;
				boost::iterator_range<iterator> range(it-2, it+8);
				for (bool val : range)
				 	cout << val << endl;
			}
			break;

		default:
			break;
		}
	}

	return 1;
}


{code}

> [C++] Provide iterator access to primitive elements inside a Column/ChunkedArray
> --------------------------------------------------------------------------------
>
>                 Key: ARROW-602
>                 URL: https://issues.apache.org/jira/browse/ARROW-602
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: C++
>            Reporter: Uwe Korn
>            Assignee: Ben Kietzman
>            Priority: Major
>              Labels: beginner, newbie
>
> Given a ChunkedArray, an Arrow user must currently iterate over all its chunks and then cast them to their types to extract the primitive memory regions to access the values. A convenient way to access the underlying values would be to offer a function that takes a ChunkedArray and returns a C++ iterator over all elements.
> While this may not be the most performant way to access the underlying data, it should have sufficient performance and adds a convenience layer for new users.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)