From 21e80e603425cf4559c36b21622e517bf30b8b21 Mon Sep 17 00:00:00 2001 From: Hide Kojima Date: Mon, 1 Jun 2026 14:37:02 -0700 Subject: [PATCH] fix: auto-detect and materialize 2-digit-year (M/D/YY) dates and datetimes The year-last date heuristic only accepted 4-digit years, and the date collector never used it at all: CollectorGuess.cpp guessed year-last dates as "date", but CollectorDate::setValue used only parseLocaleDate(), so even 4-digit M/D/YYYY dates materialized to NA. Datetime auto-detection only handled ISO8601. - DateTimeParser.h: add consumeYearFlexible() (2- or 4-digit year, %y pivot 00-68->2000s / 69-99->1900s, reject 3-digit); route parseYearLastHeuristic() and parseDateOrder() 'y' case through it; add disambiguateDayMonth() and parseYearLastHeuristicDateTime(). - Collector.cpp: CollectorDate::setValue falls back to parseYearLastHeuristic when parseLocaleDate fails; CollectorDateTime::setValue falls back to parseYearLastHeuristicDateTime when parseISO8601 fails. - CollectorGuess.cpp isDateTime(): year-last datetime fallback after ISO8601. - Tests: 2-digit MDY/DMY dates, %y pivot, invalid/3-digit rejection, the 4-digit materialization regression, 2- and 4-digit MDY datetimes, and explicit date_order with 2-digit years. Bump version to 2.2.0.3. Ref: exploratory-io/tam#36088 Co-Authored-By: Claude Opus 4.8 --- DESCRIPTION | 2 +- src/Collector.cpp | 14 ++++ src/CollectorGuess.cpp | 14 ++-- src/DateTimeParser.h | 88 +++++++++++++++++++------- tests/testthat/test-parsing-datetime.R | 75 ++++++++++++++++++++++ 5 files changed, 165 insertions(+), 28 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 605de545..9aa94708 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: readr Title: Read Rectangular Text Data -Version: 2.2.0.2 +Version: 2.2.0.3 Authors@R: c( person("Hadley", "Wickham", , "hadley@posit.co", role = "aut"), person("Jim", "Hester", role = "aut"), diff --git a/src/Collector.cpp b/src/Collector.cpp index 4e6db71f..c8a7068f 100644 --- a/src/Collector.cpp +++ b/src/Collector.cpp @@ -122,6 +122,13 @@ void CollectorDate::setValue(int i, const Token& t) { res = parser_.parseDateOrder(pLocale_->dateOrder_); } else { res = parser_.parseLocaleDate(); + if (!res) { + // Fall back to the year-last (M/D/Y or D/M/Y) heuristic so MDY/DMY + // dates (including 2-digit years) materialize, matching the guesser + // in CollectorGuess.cpp isDate(). (Issue #36088) + parser_.setDate(std_string.c_str()); + res = parser_.parseYearLastHeuristic(); + } } if (!res) { @@ -165,6 +172,13 @@ void CollectorDateTime::setValue(int i, const Token& t) { res = parser_.parseDateOrder(pLocale_->dateOrder_); } else { res = parser_.parseISO8601(); + if (!res) { + // Fall back to the year-last (M/D/Y or D/M/Y) datetime heuristic so + // MDY/DMY datetimes (including 2-digit years) materialize, matching + // the guesser in CollectorGuess.cpp isDateTime(). (Issue #36088) + parser_.setDate(std_string.c_str()); + res = parser_.parseYearLastHeuristicDateTime(); + } } if (!res) { diff --git a/src/CollectorGuess.cpp b/src/CollectorGuess.cpp index 8f3b5914..a2878b42 100644 --- a/src/CollectorGuess.cpp +++ b/src/CollectorGuess.cpp @@ -132,12 +132,16 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { return false; } - // Auto-detection: ISO8601 only (YMD, existing behavior — no change) - bool ok = parser.parseISO8601(); - if (!ok) return false; + // Auto-detection: ISO8601 first (YMD), then year-last (M/D/Y or D/M/Y) + // heuristic so MDY/DMY datetimes (including 2-digit years) are recognized. + // (Issue #36088) + if (parser.parseISO8601()) { + if (!parser.compactDate()) return true; + return parser.year() > 999; + } - if (!parser.compactDate()) return true; - return parser.year() > 999; + parser.setDate(x.c_str()); + return parser.parseYearLastHeuristicDateTime(); } [[cpp11::register]] std::string collectorGuess( diff --git a/src/DateTimeParser.h b/src/DateTimeParser.h index 0e322b7f..6a404b5b 100644 --- a/src/DateTimeParser.h +++ b/src/DateTimeParser.h @@ -145,7 +145,7 @@ class DateTimeParser { switch (datePart[i]) { case 'y': - if (!consumeInteger(4, &year_)) return false; + if (!consumeYearFlexible()) return false; break; case 'm': if (!consumeInteger(2, &mon_, false)) return false; @@ -192,43 +192,87 @@ class DateTimeParser { return isComplete(); } - // Heuristic for year-last date patterns: D/M/YYYY or M/D/YYYY - // Matches: \d{1,2}[sep]\d{1,2}[sep]\d{4} - // Disambiguation: if part1 > 12 → DMY; if part2 > 12 → MDY; else → MDY (default) - bool parseYearLastHeuristic() { - int part1, part2; - - if (!consumeInteger(2, &part1, false)) return false; - if (!consumeDateSeparator()) return false; - if (!consumeInteger(2, &part2, false)) return false; - if (!consumeDateSeparator()) return false; - if (!consumeInteger(4, &year_)) return false; - if (!isComplete()) return false; - - // Validate year is plausible - if (year_ < 1000) return false; + // Consume a year that may be 2 or 4 digits. 2-digit years use the same pivot + // as the %y format specifier (00-68 -> 2000s, 69-99 -> 1900s). 3-digit values + // (100-999) are implausible and rejected. (Issue #36088) + bool consumeYearFlexible() { + if (!consumeInteger(4, &year_, false)) return false; + if (year_ < 100) { + year_ += (year_ < 69) ? 2000 : 1900; + } else if (year_ < 1000) { + return false; + } + return true; + } + // Disambiguate a year-last date's first two components into month and day. + // part1 > 12 -> DMY; part2 > 12 -> MDY; otherwise default to MDY (US). + // Returns false if the resulting month/day are out of range. + bool disambiguateDayMonth(int part1, int part2) { if (part1 > 12) { - // Must be DMY day_ = part1; mon_ = part2; } else if (part2 > 12) { - // Must be MDY mon_ = part1; day_ = part2; } else { - // Ambiguous: default to MDY (US convention) mon_ = part1; day_ = part2; } - - // Validate month and day are in plausible range if (mon_ < 1 || mon_ > 12) return false; if (day_ < 1 || day_ > 31) return false; - return true; } + // Heuristic for year-last date patterns: D/M/Y or M/D/Y (Y = 2 or 4 digits) + // Matches: \d{1,2}[sep]\d{1,2}[sep]\d{2,4} + // Disambiguation: if part1 > 12 → DMY; if part2 > 12 → MDY; else → MDY (default) + bool parseYearLastHeuristic() { + int part1, part2; + + if (!consumeInteger(2, &part1, false)) return false; + if (!consumeDateSeparator()) return false; + if (!consumeInteger(2, &part2, false)) return false; + if (!consumeDateSeparator()) return false; + if (!consumeYearFlexible()) return false; + if (!isComplete()) return false; + + return disambiguateDayMonth(part1, part2); + } + + // Year-last datetime heuristic: a year-last date (M/D/Y or D/M/Y, 2 or 4 digit + // year) followed by a T/space separator and a HH[:MM[:SS]] time with optional + // timezone. Mirrors the time tail of parseISO8601. (Issue #36088) + bool parseYearLastHeuristicDateTime() { + int part1, part2; + + if (!consumeInteger(2, &part1, false)) return false; + if (!consumeDateSeparator()) return false; + if (!consumeInteger(2, &part2, false)) return false; + if (!consumeDateSeparator()) return false; + if (!consumeYearFlexible()) return false; + if (!disambiguateDayMonth(part1, part2)) return false; + + // Time portion is required (date-only is handled by parseYearLastHeuristic). + char next; + if (!consumeChar(&next)) return false; + if (next != 'T' && next != ' ') return false; + + if (!consumeInteger(2, &hour_)) return false; + consumeThisChar(':'); + consumeInteger(2, &min_); + consumeThisChar(':'); + consumeSeconds(&sec_, &psec_); + + if (isComplete()) return true; + + // Optional timezone + tz_ = "UTC"; + if (!consumeTzOffset(&tzOffsetHours_, &tzOffsetMinutes_)) return false; + + return isComplete(); + } + bool isComplete() { return dateItr_ == dateEnd_; } void setDate(const char* date) { diff --git a/tests/testthat/test-parsing-datetime.R b/tests/testthat/test-parsing-datetime.R index 1502adfa..2a2fa125 100644 --- a/tests/testthat/test-parsing-datetime.R +++ b/tests/testthat/test-parsing-datetime.R @@ -446,3 +446,78 @@ test_that("read_csv() reads dot-separated MDY dates", { expect_s3_class(result$date, "Date") expect_equal(result$date, as.Date(c("2024-10-02", "2024-03-15"))) }) + +# --- 2-digit-year (M/D/YY) auto-detection (Issue exploratory-io/tam#36088) --- + +test_that("read_csv() auto-detects 2-digit-year MDY dates (M/D/YY)", { + csv <- "id,date\n1,5/29/26\n2,5/31/26\n3,12/25/26" + result <- read_csv(I(csv), show_col_types = FALSE) + expect_s3_class(result$date, "Date") + expect_equal(result$date, as.Date(c("2026-05-29", "2026-05-31", "2026-12-25"))) +}) + +test_that("read_csv() auto-detects 2-digit-year DMY dates (D/M/YY)", { + # 29 > 12 in first part: unambiguously DMY + csv <- "id,date\n1,29/5/26\n2,20/1/26" + result <- read_csv(I(csv), show_col_types = FALSE) + expect_s3_class(result$date, "Date") + expect_equal(result$date, as.Date(c("2026-05-29", "2026-01-20"))) +}) + +test_that("type_convert() materializes 4-digit-year MDY dates (regression: was NA)", { + # The guesser returned "date" but CollectorDate::setValue used only + # parseLocaleDate(), so year-last dates materialized to NA. (#36088) + df <- data.frame(date = c("5/29/2026", "3/15/2026"), stringsAsFactors = FALSE) + result <- type_convert(df, guess_integer = FALSE) + expect_s3_class(result$date, "Date") + expect_equal(result$date, as.Date(c("2026-05-29", "2026-03-15"))) +}) + +test_that("read_csv() applies the %y pivot to 2-digit years", { + csv <- "id,date\n1,5/29/68\n2,5/29/69" + result <- read_csv(I(csv), show_col_types = FALSE) + expect_s3_class(result$date, "Date") + expect_equal(result$date, as.Date(c("2068-05-29", "1969-05-29"))) +}) + +test_that("guess_parser detects 2-digit-year year-last dates", { + expect_equal(guess_parser(c("5/29/26", "5/31/26"), locale()), "date") +}) + +test_that("read_csv() does not treat invalid or 3-digit-year values as dates", { + for (v in c("13/25/26", "100/200/300")) { + result <- read_csv(I(paste0("x\n", v, "\n")), show_col_types = FALSE) + expect_type(result$x, "character") + } +}) + +test_that("read_csv() auto-detects 2-digit-year MDY datetimes (M/D/YY HH:MM:SS)", { + csv <- "id,dt\n1,5/29/26 14:30:00\n2,12/25/26 23:59:59" + result <- read_csv(I(csv), show_col_types = FALSE) + expect_s3_class(result$dt, "POSIXct") + expect_equal( + result$dt, + as.POSIXct(c("2026-05-29 14:30:00", "2026-12-25 23:59:59"), tz = "UTC") + ) +}) + +test_that("read_csv() auto-detects 4-digit-year MDY datetimes (M/D/YYYY HH:MM:SS)", { + csv <- "id,dt\n1,5/29/2026 14:30:00\n2,10/15/2024 09:00:00" + result <- read_csv(I(csv), show_col_types = FALSE) + expect_s3_class(result$dt, "POSIXct") + expect_equal( + result$dt, + as.POSIXct(c("2026-05-29 14:30:00", "2024-10-15 09:00:00"), tz = "UTC") + ) +}) + +test_that("parse_date parses 2-digit-year MDY/DMY with locale date_order", { + expect_equal( + parse_date("5/29/26", locale = locale(date_order = "mdy")), + as.Date("2026-05-29") + ) + expect_equal( + parse_date("29/5/26", locale = locale(date_order = "dmy")), + as.Date("2026-05-29") + ) +})