diff --git a/Cargo.lock b/Cargo.lock index 8265fc7..5885517 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3153,6 +3153,9 @@ dependencies = [ "plateau-test-arrow-rs", "plateau-transport-arrow-rs", "reqwest", + "sample-arrow-rs", + "sample-std", + "sample-test", "serde", "serde_json", "tempfile", @@ -4277,6 +4280,19 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "sample-arrow-rs" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6a4de828117008194c83e10a0cee51794f966859305c90f29553cebe48f682" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "sample-std", +] + [[package]] name = "sample-arrow2" version = "0.17.2" diff --git a/arrow-rs/data/Cargo.toml b/arrow-rs/data/Cargo.toml index c22da6e..7b53d4b 100644 --- a/arrow-rs/data/Cargo.toml +++ b/arrow-rs/data/Cargo.toml @@ -54,9 +54,9 @@ test-log = { version = "0.2", default-features = false, features = ["trace"] } uuid = { version = "1.10", features = ["v4"] } # Remove sample-arrow2 as we're migrating to arrow-rs -# sample-test = "0.2.1" -# sample-std = "0.2.1" -# sample-arrow2 = "0.17.2" +sample-test = "0.2.1" +sample-std = "0.2.1" +sample-arrow-rs = "55.2.0" reqwest.workspace = true diff --git a/arrow-rs/data/src/segment.rs b/arrow-rs/data/src/segment.rs index 94c95e3..9d99f3b 100644 --- a/arrow-rs/data/src/segment.rs +++ b/arrow-rs/data/src/segment.rs @@ -384,11 +384,15 @@ pub mod test { use crate::test::inferences_schema_a; // Use arrow-rs transport use plateau_transport_arrow_rs as transport; - use transport::SchemaChunk; - // Remove sample_arrow2 and sample_std imports for now since we removed these dependencies - // We'll reimplement test functionality using arrow-rs libraries + use sample_arrow_rs::{ + array::ArbitraryArray, + chunk::ArbitraryChunk, + datatypes::{sample_flat, ArbitraryDataType}, + }; + use sample_std::{Chance, Regex}; use tempfile::tempdir; use test::arrow::test::partial_write; + use transport::SchemaChunk; impl Config { pub fn nocommit() -> Self { @@ -428,8 +432,6 @@ pub mod test { } // nulls=true breaks arrow2's parquet support, but is fine for feather - // XXX - need to create sample-arrow first - /* pub fn deep_chunk(depth: usize, len: usize, nulls: bool) -> ArbitraryChunk { let names = Regex::new("[a-z]{4,8}"); let data_type = ArbitraryDataType { @@ -457,7 +459,6 @@ pub mod test { array, } } - */ #[test] fn test_interrupted_cache_write() -> Result<()> { diff --git a/arrow-rs/data/src/segment/arrow.rs b/arrow-rs/data/src/segment/arrow.rs index d735d5e..ee7a01b 100644 --- a/arrow-rs/data/src/segment/arrow.rs +++ b/arrow-rs/data/src/segment/arrow.rs @@ -303,11 +303,10 @@ pub mod test { // Fix imports to use arrow-rs versions use plateau_transport_arrow_rs as transport; use transport::SchemaChunk; - // Temporarily comment out sample_arrow2 dependencies - // use sample_arrow2::chunk::{ChainedChunk, ChainedMultiChunk}; - // use sample_std::{Random, Regex, Sample}; - // use sample_test::sample_test; - // use crate::segment::test::deep_chunk; + // Use sample-arrow-rs for property-based testing + use crate::segment::test::deep_chunk; + use sample_arrow_rs::chunk::{ChainedChunk, ChainedMultiChunk}; + use sample_test::sample_test; use tempfile::tempdir; use super::*; @@ -537,76 +536,50 @@ pub mod test { Ok(()) } - // Temporarily comment out tests that depend on sample_arrow2 - /* + // Property-based tests using sample-arrow-rs #[sample_test] + #[test_log::test] fn arbitrary_chunk(#[sample(deep_chunk(3, 100, true).sample_one())] chunk: ChainedChunk) { - let chunk = chunk.value; + // In arrow-rs, RecordBatch already contains its schema + let batch = chunk.value; let root = tempdir().unwrap(); let path = root.path().join("testing.arrow"); + trace!(?batch); + + // The RecordBatch already has a schema embedded + let schema = (*batch.schema()).clone(); - use sample_std::Sample; - let mut name = Regex::new("[a-z]{4, 8}"); - let mut g = Random::new(); - - let schema = Schema { - fields: chunk - .iter() - .map(|arr| { - Field::new( - name.generate(&mut g), - arr.data_type().clone(), - arr.validity().is_some(), - ) - }) - .collect(), - metadata: Metadata::default(), - }; let mut w = Writer::create_path(&path, &schema).unwrap(); - w.write_chunk(chunk.clone()).unwrap(); + w.write_chunk(batch.clone()).unwrap(); w.end().unwrap(); let r = Reader::open(&path).unwrap(); let chunks = r.collect::>>().unwrap(); - assert_eq!(chunks, vec![chunk]); + assert_eq!(chunks, vec![batch]); } #[sample_test] + #[test_log::test] fn arbitrary_many_chunk( #[sample(deep_chunk(5, 100, true).sample_many(2..10))] chunk: ChainedMultiChunk, ) { - let chunks = chunk.value; + let batches = chunk.value; + trace!(?batches); let root = tempdir().unwrap(); let path = root.path().join("testing.arrow"); - let mut name = Regex::new("[a-z]{4, 8}"); - let mut g = Random::new(); - - let schema = Schema { - fields: chunks - .first() - .unwrap() - .iter() - .map(|arr| { - Field::new( - name.generate(&mut g), - arr.data_type().clone(), - arr.validity().is_some(), - ) - }) - .collect(), - metadata: Metadata::default(), - }; + // Get schema from first batch - all batches should have same schema + let schema = (*batches.first().unwrap().schema()).clone(); + let mut w = Writer::create_path(&path, &schema).unwrap(); - for chunk in &chunks { - w.write_chunk(chunk.clone()).unwrap(); + for batch in &batches { + w.write_chunk(batch.clone()).unwrap(); } w.end().unwrap(); let r = Reader::open(&path).unwrap(); - let actual_chunks = r.collect::>>().unwrap(); - assert_eq!(actual_chunks, chunks); + let actual_batches = r.collect::>>().unwrap(); + assert_eq!(actual_batches, batches); } - */ }