diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 1c0ae079ef1..42d7047eea7 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -179,9 +179,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( auto const d_items = view.buffer_views[binary_view_vector_idx].data.as_binary_view; auto variadic_ptrs = std::vector(); for (auto i = 0L; i < view.n_variadic_buffers; ++i) { - auto variadic_buf = - ArrowArrayBuffer(const_cast(input), i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS); - variadic_ptrs.push_back(reinterpret_cast(variadic_buf->data)); + variadic_ptrs.push_back(reinterpret_cast(view.variadic_buffers[i])); } auto d_variadic_ptrs = cudf::detail::make_device_uvector_async( variadic_ptrs, stream, cudf::get_current_device_resource_ref()); diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 2d8f9e3f7d4..0e50a5ecda2 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -536,51 +536,143 @@ TEST_F(FromArrowDeviceTest, StringViewType) NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.children[0], NANOARROW_TYPE_STRING_VIEW)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema.children[0], "a")); - nanoarrow::UniqueArray input_array; - NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), &schema, nullptr)); - input_array->length = input.length; - input_array->null_count = input.null_count; - auto device_array = input_array->children[0]; - device_array->length = input.length; - device_array->null_count = input.null_count; - - // Build the device-array buffers: - // buffers[0]=validity, [1]=views, - // [2..2+N-1]=variadic data ptrs, [2+N]=variadic sizes - - // validity buffer - NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(device_array, 0), noop_alloc)); - ArrowArrayBuffer(device_array, 0)->size_bytes = - cudf::bitmask_allocation_size_bytes(expected_view.size()); - ArrowArrayBuffer(device_array, 0)->data = - const_cast(reinterpret_cast(expected_view.null_mask())); - // views buffer - NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(device_array, 1), noop_alloc)); - ArrowArrayBuffer(device_array, 1)->size_bytes = d_items.size() * sizeof(ArrowBinaryView); - ArrowArrayBuffer(device_array, 1)->data = reinterpret_cast(d_items.data()); - // variadic buffers - NANOARROW_THROW_NOT_OK(ArrowArrayAddVariadicBuffers(device_array, variadics.size())); - for (std::size_t i = 0; i < variadics.size(); ++i) { - auto const buffer_idx = i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS; - NANOARROW_THROW_NOT_OK( - ArrowBufferSetAllocator(ArrowArrayBuffer(device_array, buffer_idx), noop_alloc)); - ArrowArrayBuffer(device_array, buffer_idx)->size_bytes = variadics[i].size(); - ArrowArrayBuffer(device_array, buffer_idx)->data = reinterpret_cast(variadic_ptrs[i]); - // not sure how the private_data variadic_buffer should be set + auto variadic_sizes = std::vector(); + for (auto const& buf : variadics) { + variadic_sizes.push_back(static_cast(buf.size())); } - NANOARROW_THROW_NOT_OK( - ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + // Arrow C Data STRING_VIEW layout: [validity, views, variadic0..N-1, variadic_sizes] + auto child_buffers = + std::vector(NANOARROW_BINARY_VIEW_FIXED_BUFFERS + variadic_ptrs.size() + 1); + child_buffers[0] = expected_view.null_mask(); + child_buffers[1] = d_items.data(); + for (std::size_t i = 0; i < variadic_ptrs.size(); ++i) { + child_buffers[i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS] = variadic_ptrs[i]; + } + child_buffers.back() = variadic_sizes.data(); + + ArrowArray child_array{}; + child_array.length = input.length; + child_array.null_count = expected_view.null_count(); + child_array.offset = 0; + child_array.n_buffers = static_cast(child_buffers.size()); + child_array.n_children = 0; + child_array.buffers = child_buffers.data(); + child_array.children = nullptr; + child_array.dictionary = nullptr; + child_array.release = nullptr; + + ArrowArray* children[] = {&child_array}; + void const* struct_buffers[] = {nullptr}; + ArrowArray struct_array{}; + struct_array.length = input.length; + struct_array.null_count = 0; + struct_array.offset = 0; + struct_array.n_buffers = 1; + struct_array.n_children = 1; + struct_array.buffers = struct_buffers; + struct_array.children = children; + struct_array.dictionary = nullptr; + struct_array.release = nullptr; ArrowDeviceArray input_device_array; input_device_array.device_id = rmm::get_current_cuda_device().value(); input_device_array.device_type = ARROW_DEVICE_CUDA; input_device_array.sync_event = nullptr; - memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray)); + input_device_array.array = struct_array; auto got_cudf_table_view = cudf::from_arrow_device(&schema, &input_device_array); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view({expected_col}), *got_cudf_table_view); } +TEST_F(FromArrowDeviceTest, StringViewTypeWithProducerOwnedPrivateData) +{ + auto data = std::vector({"hello", + "worldy", + "much longer string", + "", + "another even longer string", + "", + "other string"}); + + auto validity = std::vector{true, true, true, false, true, true, true}; + auto expected_col = + cudf::test::strings_column_wrapper(data.begin(), data.end(), validity.begin()); + auto expected_view = cudf::column_view(expected_col); + + nanoarrow::UniqueArray input; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(input.get(), NANOARROW_TYPE_STRING_VIEW)); + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(input.get())); + for (auto const& str : data) { + auto item = ArrowStringView{str.c_str(), static_cast(str.size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(input.get(), item)); + } + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + nanoarrow::UniqueSchema schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_STRING_VIEW)); + ArrowArrayView view; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&view, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&view, input.get(), nullptr)); + ASSERT_GT(view.n_variadic_buffers, 0); + + auto stream = cudf::get_default_stream(); + auto items = view.buffer_views[1].data.as_binary_view; + auto d_items = rmm::device_uvector(input->length, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), + items, + input->length * sizeof(ArrowBinaryView), + cudaMemcpyDefault, + stream.value())); + auto variadics = std::vector(); + auto variadic_ptrs = std::vector(); + for (auto i = 0L; i < view.n_variadic_buffers; ++i) { + variadics.emplace_back(view.variadic_buffers[i], view.variadic_buffer_sizes[i], stream); + variadic_ptrs.push_back(static_cast(variadics.back().data())); + } + stream.synchronize(); + + auto variadic_sizes = std::vector(); + for (auto i = 0L; i < view.n_variadic_buffers; ++i) { + variadic_sizes.push_back(view.variadic_buffer_sizes[i]); + } + + // Arrow C Data STRING_VIEW layout: [validity, views, variadic0..N-1, variadic_sizes] + auto device_buffers = + std::vector(NANOARROW_BINARY_VIEW_FIXED_BUFFERS + variadic_ptrs.size() + 1); + device_buffers[0] = expected_view.null_mask(); + device_buffers[1] = d_items.data(); + for (std::size_t i = 0; i < variadic_ptrs.size(); ++i) { + device_buffers[i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS] = variadic_ptrs[i]; + } + device_buffers.back() = variadic_sizes.data(); + + int producer_private_data = 0; + ArrowArray device_array{}; + device_array.length = input->length; + device_array.null_count = expected_view.null_count(); + device_array.offset = 0; + device_array.n_buffers = static_cast(device_buffers.size()); + device_array.n_children = 0; + device_array.buffers = device_buffers.data(); + device_array.children = nullptr; + device_array.dictionary = nullptr; + device_array.release = nullptr; + device_array.private_data = &producer_private_data; + + ArrowDeviceArray input_device_array; + input_device_array.device_id = rmm::get_current_cuda_device().value(); + input_device_array.device_type = ARROW_DEVICE_CUDA; + input_device_array.sync_event = nullptr; + input_device_array.array = device_array; + + // Mirrors external Arrow C Device producers where ArrowArray.private_data is producer-owned + // and not Nanoarrow state. + auto got_cudf_col = cudf::from_arrow_device_column(schema.get(), &input_device_array); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_view, *got_cudf_col); +} + void slice_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) { auto op = [&](ArrowArray* array) {