Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions cpp/src/interop/from_arrow_device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
auto const d_items = view.buffer_views[binary_view_vector_idx].data.as_binary_view;
auto variadic_ptrs = std::vector<char const*>();
for (auto i = 0L; i < view.n_variadic_buffers; ++i) {
auto variadic_buf =
ArrowArrayBuffer(const_cast<ArrowArray*>(input), i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS);
variadic_ptrs.push_back(reinterpret_cast<char const*>(variadic_buf->data));
variadic_ptrs.push_back(reinterpret_cast<char const*>(view.variadic_buffers[i]));
}
auto d_variadic_ptrs = cudf::detail::make_device_uvector_async(
variadic_ptrs, stream, cudf::get_current_device_resource_ref());
Expand Down
160 changes: 126 additions & 34 deletions cpp/tests/interop/from_arrow_device_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -536,51 +536,143 @@ TEST_F(FromArrowDeviceTest, StringViewType)
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.children[0], NANOARROW_TYPE_STRING_VIEW));
NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema.children[0], "a"));

nanoarrow::UniqueArray input_array;
NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), &schema, nullptr));
input_array->length = input.length;
input_array->null_count = input.null_count;
auto device_array = input_array->children[0];
device_array->length = input.length;
device_array->null_count = input.null_count;

// Build the device-array buffers:
// buffers[0]=validity, [1]=views,
// [2..2+N-1]=variadic data ptrs, [2+N]=variadic sizes

// validity buffer
NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(device_array, 0), noop_alloc));
ArrowArrayBuffer(device_array, 0)->size_bytes =
cudf::bitmask_allocation_size_bytes(expected_view.size());
ArrowArrayBuffer(device_array, 0)->data =
const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(expected_view.null_mask()));
// views buffer
NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(device_array, 1), noop_alloc));
ArrowArrayBuffer(device_array, 1)->size_bytes = d_items.size() * sizeof(ArrowBinaryView);
ArrowArrayBuffer(device_array, 1)->data = reinterpret_cast<uint8_t*>(d_items.data());
// variadic buffers
NANOARROW_THROW_NOT_OK(ArrowArrayAddVariadicBuffers(device_array, variadics.size()));
for (std::size_t i = 0; i < variadics.size(); ++i) {
auto const buffer_idx = i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
NANOARROW_THROW_NOT_OK(
ArrowBufferSetAllocator(ArrowArrayBuffer(device_array, buffer_idx), noop_alloc));
ArrowArrayBuffer(device_array, buffer_idx)->size_bytes = variadics[i].size();
ArrowArrayBuffer(device_array, buffer_idx)->data = reinterpret_cast<uint8_t*>(variadic_ptrs[i]);
// not sure how the private_data variadic_buffer should be set
auto variadic_sizes = std::vector<int64_t>();
for (auto const& buf : variadics) {
variadic_sizes.push_back(static_cast<int64_t>(buf.size()));
}
NANOARROW_THROW_NOT_OK(
ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));

// Arrow C Data STRING_VIEW layout: [validity, views, variadic0..N-1, variadic_sizes]
auto child_buffers =
std::vector<void const*>(NANOARROW_BINARY_VIEW_FIXED_BUFFERS + variadic_ptrs.size() + 1);
child_buffers[0] = expected_view.null_mask();
child_buffers[1] = d_items.data();
for (std::size_t i = 0; i < variadic_ptrs.size(); ++i) {
child_buffers[i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS] = variadic_ptrs[i];
}
child_buffers.back() = variadic_sizes.data();

ArrowArray child_array{};
child_array.length = input.length;
child_array.null_count = expected_view.null_count();
child_array.offset = 0;
child_array.n_buffers = static_cast<int64_t>(child_buffers.size());
child_array.n_children = 0;
child_array.buffers = child_buffers.data();
child_array.children = nullptr;
child_array.dictionary = nullptr;
child_array.release = nullptr;

ArrowArray* children[] = {&child_array};
void const* struct_buffers[] = {nullptr};
ArrowArray struct_array{};
struct_array.length = input.length;
struct_array.null_count = 0;
struct_array.offset = 0;
struct_array.n_buffers = 1;
struct_array.n_children = 1;
struct_array.buffers = struct_buffers;
struct_array.children = children;
struct_array.dictionary = nullptr;
struct_array.release = nullptr;
Comment thread
0ax1 marked this conversation as resolved.

ArrowDeviceArray input_device_array;
input_device_array.device_id = rmm::get_current_cuda_device().value();
input_device_array.device_type = ARROW_DEVICE_CUDA;
input_device_array.sync_event = nullptr;
memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
input_device_array.array = struct_array;

auto got_cudf_table_view = cudf::from_arrow_device(&schema, &input_device_array);
CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view({expected_col}), *got_cudf_table_view);
}

TEST_F(FromArrowDeviceTest, StringViewTypeWithProducerOwnedPrivateData)
{
auto data = std::vector<std::string>({"hello",
"worldy",
"much longer string",
"",
"another even longer string",
"",
"other string"});

auto validity = std::vector<bool>{true, true, true, false, true, true, true};
auto expected_col =
cudf::test::strings_column_wrapper(data.begin(), data.end(), validity.begin());
auto expected_view = cudf::column_view(expected_col);

nanoarrow::UniqueArray input;
NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(input.get(), NANOARROW_TYPE_STRING_VIEW));
NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(input.get()));
for (auto const& str : data) {
auto item = ArrowStringView{str.c_str(), static_cast<int64_t>(str.size())};
NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(input.get(), item));
}
NANOARROW_THROW_NOT_OK(
ArrowArrayFinishBuilding(input.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));

nanoarrow::UniqueSchema schema;
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_STRING_VIEW));
ArrowArrayView view;
NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&view, schema.get(), nullptr));
NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&view, input.get(), nullptr));
ASSERT_GT(view.n_variadic_buffers, 0);

auto stream = cudf::get_default_stream();
auto items = view.buffer_views[1].data.as_binary_view;
auto d_items = rmm::device_uvector<ArrowBinaryView>(input->length, stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(),
items,
input->length * sizeof(ArrowBinaryView),
cudaMemcpyDefault,
stream.value()));
auto variadics = std::vector<rmm::device_buffer>();
auto variadic_ptrs = std::vector<char*>();
for (auto i = 0L; i < view.n_variadic_buffers; ++i) {
variadics.emplace_back(view.variadic_buffers[i], view.variadic_buffer_sizes[i], stream);
variadic_ptrs.push_back(static_cast<char*>(variadics.back().data()));
}
stream.synchronize();

auto variadic_sizes = std::vector<int64_t>();
for (auto i = 0L; i < view.n_variadic_buffers; ++i) {
variadic_sizes.push_back(view.variadic_buffer_sizes[i]);
}

// Arrow C Data STRING_VIEW layout: [validity, views, variadic0..N-1, variadic_sizes]
auto device_buffers =
std::vector<void const*>(NANOARROW_BINARY_VIEW_FIXED_BUFFERS + variadic_ptrs.size() + 1);
device_buffers[0] = expected_view.null_mask();
device_buffers[1] = d_items.data();
for (std::size_t i = 0; i < variadic_ptrs.size(); ++i) {
device_buffers[i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS] = variadic_ptrs[i];
}
device_buffers.back() = variadic_sizes.data();

int producer_private_data = 0;
ArrowArray device_array{};
device_array.length = input->length;
device_array.null_count = expected_view.null_count();
device_array.offset = 0;
device_array.n_buffers = static_cast<int64_t>(device_buffers.size());
device_array.n_children = 0;
device_array.buffers = device_buffers.data();
device_array.children = nullptr;
device_array.dictionary = nullptr;
device_array.release = nullptr;
device_array.private_data = &producer_private_data;

ArrowDeviceArray input_device_array;
input_device_array.device_id = rmm::get_current_cuda_device().value();
input_device_array.device_type = ARROW_DEVICE_CUDA;
input_device_array.sync_event = nullptr;
input_device_array.array = device_array;

// Mirrors external Arrow C Device producers where ArrowArray.private_data is producer-owned
// and not Nanoarrow state.
auto got_cudf_col = cudf::from_arrow_device_column(schema.get(), &input_device_array);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_view, *got_cudf_col);
}

void slice_nanoarrow(ArrowArray* arr, int64_t start, int64_t end)
{
auto op = [&](ArrowArray* array) {
Expand Down
Loading