From b926dfe472d806690b6d2187c8d86429eab54413 Mon Sep 17 00:00:00 2001 From: Dawid Kaledkowski Date: Thu, 21 Aug 2025 13:06:09 +0200 Subject: [PATCH 1/8] Relax restrictions for indirect link --- R/join_keys-extract.R | 4 +- R/join_keys-print.R | 10 +-- R/join_keys-utils.R | 64 +++++++++---------- ...arents.Rd => dot-append_indirect_links.Rd} | 11 ++-- tests/testthat/test-join_keys-extract.R | 22 +++++-- tests/testthat/test-join_keys-print.R | 6 +- vignettes/join-keys.Rmd | 6 +- 7 files changed, 68 insertions(+), 55 deletions(-) rename man/{update_keys_given_parents.Rd => dot-append_indirect_links.Rd} (53%) diff --git a/R/join_keys-extract.R b/R/join_keys-extract.R index a533c5045..b090dbb89 100644 --- a/R/join_keys-extract.R +++ b/R/join_keys-extract.R @@ -6,7 +6,7 @@ #' given `names`, including parent `names` and symmetric mirror keys between #' `names` in the result. #' - `x[i, j]`: Returns join keys between datasets `i` and `j`, -#' including implicit keys inferred from their relationship with a parent. +#' including implicit keys inferred from their relationship via foreign dataset. #' #' @param i,j indices specifying elements to extract or replace. Index should be a #' a character vector, but it can also take numeric, logical, `NULL` or missing. @@ -56,7 +56,7 @@ if (is.numeric(i)) i <- names(x)[i] if (is.numeric(j)) j <- names(x)[j] - subset_x <- update_keys_given_parents(x[union(i, j)]) + subset_x <- .append_indirect_links(x[union(i, j)]) return(subset_x[[i]][[j]]) } else if (!missing(j)) { # ie. select all keys which have j as dataset_2 diff --git a/R/join_keys-print.R b/R/join_keys-print.R index ad231fd78..914dfd0e9 100644 --- a/R/join_keys-print.R +++ b/R/join_keys-print.R @@ -6,7 +6,7 @@ format.join_keys <- function(x, ...) { my_parents <- parents(x) names_sorted <- topological_sort(my_parents) names <- union(names_sorted, names(x)) - x_implicit <- update_keys_given_parents(x) + x_indirect <- .append_indirect_links(x) out <- lapply(names, function(i) { out_i <- lapply(union(i, names(x[[i]])), function(j) { direction <- if (identical(my_parents[[j]], i)) { @@ -27,13 +27,13 @@ format.join_keys <- function(x, ...) { ) }) - implicit_datasets <- setdiff(names(x_implicit[[i]]), names(x[[i]])) - if (length(implicit_datasets) > 0) { + indirect_datasets <- setdiff(names(x_indirect[[i]]), names(x[[i]])) + if (length(indirect_datasets) > 0) { out_i <- c( out_i, paste0( - " --* (implicit via parent with): ", - paste(implicit_datasets, collapse = ", ") + " --* (indirect via foreign dataset): ", + paste(indirect_datasets, collapse = ", ") ) ) } diff --git a/R/join_keys-utils.R b/R/join_keys-utils.R index da084e972..9615f64e3 100644 --- a/R/join_keys-utils.R +++ b/R/join_keys-utils.R @@ -106,48 +106,46 @@ assert_compatible_keys2 <- function(x, y) { TRUE } -#' Updates the keys of the datasets based on the parents +#' Append indirect links +#' +#' Adds the keys between datasets if they are connected by the same set of keys +#' via other foreign datasets. #' #' @param x (`join_keys`) object to update the keys. #' #' @return (`self`) invisibly for chaining #' #' @keywords internal -update_keys_given_parents <- function(x) { - jk <- join_keys(x) - - checkmate::assert_class(jk, "join_keys", .var.name = checkmate::vname(x)) - - datanames <- names(jk) - for (d1_ix in seq_along(datanames)) { - d1 <- datanames[[d1_ix]] - d1_parent <- parent(jk, d1) - for (d2 in datanames[-1 * seq.int(d1_ix)]) { - if (length(jk[[d1]][[d2]]) == 0) { - d2_parent <- parent(jk, d2) - - if (!identical(d1_parent, d2_parent) || length(d1_parent) == 0) next - - # both has the same parent -> common keys to parent - keys_d1_parent <- sort(jk[[d1]][[d1_parent]]) - keys_d2_parent <- sort(jk[[d2]][[d2_parent]]) - - common_ix_1 <- unname(keys_d1_parent) %in% unname(keys_d2_parent) - common_ix_2 <- unname(keys_d2_parent) %in% unname(keys_d1_parent) - - # No common keys between datasets - leave empty - if (all(!common_ix_1)) next +.append_indirect_links <- function(x) { + for (node_name in names(x)) { + x <- .search_recursive(x, node_name) + } + x +} - fk <- structure( - names(keys_d2_parent)[common_ix_2], - names = names(keys_d1_parent)[common_ix_1] +.search_recursive <- function(x, node_name, parent_name, nodes_visited = node_name) { + children_names <- setdiff(names(x[[node_name]]), nodes_visited) + nodes_visited <- c(nodes_visited, children_names) + if (length(children_names)) { + for (child_name in children_names) { + # todo: make sure they are connected correctly with the keys + if (!missing(parent_name)) { + ancestors_key_pair <- x[[parent_name]][[node_name]] # !important: using x[a, b] will result in infinite loop + this_key_pair <- x[[node_name]][[child_name]] + if (!identical(unname(ancestors_key_pair), names(this_key_pair))) next + x <- c( + x, + join_key( + dataset_1 = parent_name, + dataset_2 = child_name, + keys = setNames(unname(this_key_pair), names(ancestors_key_pair)), + directed = FALSE + ) ) - jk[[d1]][[d2]] <- fk # mutate join key } + x <- .search_recursive(x = x, node_name = child_name, parent_name = node_name, nodes_visited = nodes_visited) + nodes_visited <- union(nodes_visited, names(x[[node_name]])) } } - # check parent child relation - assert_parent_child(x = jk) - - jk + x } diff --git a/man/update_keys_given_parents.Rd b/man/dot-append_indirect_links.Rd similarity index 53% rename from man/update_keys_given_parents.Rd rename to man/dot-append_indirect_links.Rd index 98a74a837..313628f25 100644 --- a/man/update_keys_given_parents.Rd +++ b/man/dot-append_indirect_links.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/join_keys-utils.R -\name{update_keys_given_parents} -\alias{update_keys_given_parents} -\title{Updates the keys of the datasets based on the parents} +\name{.append_indirect_links} +\alias{.append_indirect_links} +\title{Append indirect links} \usage{ -update_keys_given_parents(x) +.append_indirect_links(x) } \arguments{ \item{x}{(\code{join_keys}) object to update the keys.} @@ -13,6 +13,7 @@ update_keys_given_parents(x) (\code{self}) invisibly for chaining } \description{ -Updates the keys of the datasets based on the parents +Adds the keys between datasets if they are connected by the same set of keys +via other foreign datasets. } \keyword{internal} diff --git a/tests/testthat/test-join_keys-extract.R b/tests/testthat/test-join_keys-extract.R index 970de3448..b98fc0886 100644 --- a/tests/testthat/test-join_keys-extract.R +++ b/tests/testthat/test-join_keys-extract.R @@ -141,7 +141,7 @@ testthat::test_that("join_keys[i,j] returns keys for pair given by numeric indic testthat::expect_identical(my_keys[2, 1], c(`child-parent` = "child-parent")) }) -testthat::test_that("join_keys[i,j] return NULL for given pair when no such key and no common parent", { +testthat::test_that("join_keys[i,j] return NULL for given pair when not linked via foreign datasets and keys", { my_keys <- join_keys( join_key("a", "a", "aa"), join_key("b", "b", "bb"), @@ -153,7 +153,7 @@ testthat::test_that("join_keys[i,j] return NULL for given pair when no such key }) testthat::test_that( - "join_keys[i,j] doesn't infer keys between children if they don't have common key to parent", + "join_keys[i,j] doesn't infer keys between datasets if they don't refer to the same column in foreign dataset", { my_keys <- join_keys( join_key("a", "a", "aa"), @@ -167,7 +167,7 @@ testthat::test_that( ) testthat::test_that( - "join_keys[i,j] doesn't infer keys between grandchildren", + "join_keys[i,j] doesn't infer keys between grandchildren if they don't refer to the same col in the foreign dataset", { my_keys <- join_keys( join_key("a", "a", "aa"), @@ -183,7 +183,7 @@ testthat::test_that( ) testthat::test_that( - "join_keys[i,j ] infer keys between children through foreign keys to parent. ", + "join_keys[i,j ] infer keys between datasets through foreign datasets to parent. ", { my_keys <- join_keys( join_key("a", "a", "aa"), @@ -197,6 +197,20 @@ testthat::test_that( } ) +testthat::test_that( + "join_keys[i,j ] infer keys between datasets if they are linked to the same col in their common foreign dataset.", + { + my_keys <- join_keys( + join_key("a", "a", "aa", directed = FALSE), + join_key("b", "b", "bb", directed = FALSE), + join_key("c", "c", "cc", directed = FALSE), + join_key("a", "b", c("aa" = "bb")), + join_key("a", "c", c("aa" = "cc")) + ) + testthat::expect_identical(my_keys["b", "c"], c(bb = "cc")) + } +) + testthat::test_that("join_keys[i,j] returns NULL for inexisting key pair (can't even infer)", { my_keys <- join_keys( join_key("a", "a", "aa"), diff --git a/tests/testthat/test-join_keys-print.R b/tests/testthat/test-join_keys-print.R index f9aef2d6d..93d4eb4d8 100644 --- a/tests/testthat/test-join_keys-print.R +++ b/tests/testthat/test-join_keys-print.R @@ -42,7 +42,7 @@ testthat::test_that("format.join_keys for parents", { ) }) -testthat::test_that("format.join_keys print inferred keys for children sharing parent", { +testthat::test_that("format.join_keys print inferred keys for datasets linked via foreign dataset", { my_keys <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), @@ -56,8 +56,8 @@ testthat::test_that("format.join_keys print inferred keys for children sharing p paste( "A join_keys object containing foreign keys between 3 datasets:", "d1: [a]", " <-- d2: [child-a]", " <-- d3: [child-a]", - "d2: [b]", " --> d1: [child-a]", " --* (implicit via parent with): d3", - "d3: [c]", " --> d1: [child-a]", " --* (implicit via parent with): d2", + "d2: [b]", " --> d1: [child-a]", " --* (indirect via foreign dataset): d3", + "d3: [c]", " --> d1: [child-a]", " --* (indirect via foreign dataset): d2", sep = "\n" ) ) diff --git a/vignettes/join-keys.Rmd b/vignettes/join-keys.Rmd index 9ead04820..69d9d62f0 100644 --- a/vignettes/join-keys.Rmd +++ b/vignettes/join-keys.Rmd @@ -93,10 +93,10 @@ jk | `## <-- ds3: [col_1]` | | | `## ds2: [col_1, col_2]` | | | `## --> ds1: [col_1]` | arrow `-->` denotes `ds2` is a child of `ds1` | -| `## --* (implicit via parent with): ds3` | **Implicit relationship between `ds2` & `ds3`**
_(given that they share common keys with same parent)_ | +| `## --* (indirect via foreign dataset): ds3` | **Implicit relationship between `ds2` & `ds3`**
_(given that they are linked to the same keys in their common foreign dataset)_ | | `## ds3: [col_1, col_3]` | | | `## --> ds1: [col_1]` | | -| `## --* (implicit via parent with): ds2` | | +| `## --* (indirect via foreign dataset): ds2` | | | `## ds4: [no primary keys]` | | | `## <-> ds5: [col_5]` | **Foreign keys**
_(arrow `<->` denotes no parent definition between datasets)_ | | `## ds5: [no primary keys]` | | @@ -174,7 +174,7 @@ c(jk1, join_key("ds3", "ds3", "col_3")) There are 2 types of relationships encoded with _joining keys_ that are described in the following sections. The _primary_ and _foreign_ keys are created explicitly using the constructor for individual keys (`join_key`). -Additionally, the `join_keys` object infers implicit relationships when two datasets share foreign keys to a parent dataset, but not between themselves. These implicit relationships are available just like another foreign key and can be used to merge datasets, despite not being defined by the user. +Additionally, the `join_keys` object infers indirect relationships when two datasets are linked via the same keys in their foreign dataset, but not between themselves. These implicit relationships are available just like another foreign key and can be used to merge datasets, despite not being defined by the user. ### Primary Key with `teal_data` From 0da0ae9b85e3d249da2615919cadcddba450b27d Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Aug 2025 11:12:29 +0000 Subject: [PATCH 2/8] [skip roxygen] [skip vbump] Roxygen Man Pages Auto Update --- man/join_keys.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/join_keys.Rd b/man/join_keys.Rd index d75489f0e..0ff481604 100644 --- a/man/join_keys.Rd +++ b/man/join_keys.Rd @@ -103,7 +103,7 @@ in a parent-child relationship and the mapping is automatically mirrored between given \code{names}, including parent \code{names} and symmetric mirror keys between \code{names} in the result. \item \code{x[i, j]}: Returns join keys between datasets \code{i} and \code{j}, -including implicit keys inferred from their relationship with a parent. +including implicit keys inferred from their relationship via foreign dataset. } From 4204bac13c1f345e04ff043ab9453db6052734cf Mon Sep 17 00:00:00 2001 From: Dawid Kaledkowski Date: Thu, 21 Aug 2025 13:23:34 +0200 Subject: [PATCH 3/8] prefix --- R/join_keys-utils.R | 2 +- tests/testthat/test-join_keys-extract.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/join_keys-utils.R b/R/join_keys-utils.R index 9615f64e3..3f3006a08 100644 --- a/R/join_keys-utils.R +++ b/R/join_keys-utils.R @@ -138,7 +138,7 @@ assert_compatible_keys2 <- function(x, y) { join_key( dataset_1 = parent_name, dataset_2 = child_name, - keys = setNames(unname(this_key_pair), names(ancestors_key_pair)), + keys = stats::setNames(unname(this_key_pair), names(ancestors_key_pair)), directed = FALSE ) ) diff --git a/tests/testthat/test-join_keys-extract.R b/tests/testthat/test-join_keys-extract.R index b98fc0886..d28e31b00 100644 --- a/tests/testthat/test-join_keys-extract.R +++ b/tests/testthat/test-join_keys-extract.R @@ -289,7 +289,7 @@ testthat::test_that("join_keys[i,j]<- removes keys with NULL", { d1 = list(d1 = c(A = "A")), d2 = list(d2 = c(B = "B")) ), - parents = setNames(list(), character(0)), # named list + parents = stats::setNames(list(), character(0)), # named list class = c("join_keys", "list") ) ) From 205c7185fe828ca78ccbdfba87201e4800afa9d1 Mon Sep 17 00:00:00 2001 From: Dawid Kaledkowski Date: Mon, 27 Oct 2025 09:30:18 +0100 Subject: [PATCH 4/8] names --- NAMESPACE | 1 + R/join_keys-names.R | 15 +++++++++++++++ ...names-set-.join_keys.Rd => names.join_keys.Rd} | 5 ++++- 3 files changed, 20 insertions(+), 1 deletion(-) rename man/{names-set-.join_keys.Rd => names.join_keys.Rd} (83%) diff --git a/NAMESPACE b/NAMESPACE index 92d44016a..c1ace5731 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ S3method(join_keys,default) S3method(join_keys,join_keys) S3method(join_keys,teal_data) S3method(length,teal.data) +S3method(names,join_keys) S3method(names,teal_data) S3method(parents,join_keys) S3method(parents,teal_data) diff --git a/R/join_keys-names.R b/R/join_keys-names.R index ae029a76f..28dcc7b10 100644 --- a/R/join_keys-names.R +++ b/R/join_keys-names.R @@ -1,6 +1,10 @@ #' The names of a `join_keys` object #' #' @inheritParams base::`names<-` +#' @name names.join_keys + + +#' @rdname names.join_keys #' @export `names<-.join_keys` <- function(x, value) { new_x <- unclass(x) @@ -30,3 +34,14 @@ class(new_x) <- c("join_keys", "list") new_x } + +#' @rdname names.join_keys +#' @export +names.join_keys <- function(x) { + new_x <- unclass(x) + child_parent <- sapply(names(new_x), parent, x = x, USE.NAMES = TRUE, simplify = FALSE) + union( + unlist(topological_sort(child_parent)), + names(new_x) + ) +} diff --git a/man/names-set-.join_keys.Rd b/man/names.join_keys.Rd similarity index 83% rename from man/names-set-.join_keys.Rd rename to man/names.join_keys.Rd index ddcf09ecf..07c95620f 100644 --- a/man/names-set-.join_keys.Rd +++ b/man/names.join_keys.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/join_keys-names.R -\name{names<-.join_keys} +\name{names.join_keys} +\alias{names.join_keys} \alias{names<-.join_keys} \title{The names of a \code{join_keys} object} \usage{ \method{names}{join_keys}(x) <- value + +\method{names}{join_keys}(x) } \arguments{ \item{x}{an \R object.} From d0631b1053d68df2903e238e64c3f320dfa93092 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 08:33:08 +0000 Subject: [PATCH 5/8] [skip roxygen] [skip vbump] Roxygen Man Pages Auto Update --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index f7bff9002..441c59b1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -56,7 +56,7 @@ Encoding: UTF-8 Language: en-US LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Collate: 'cdisc_data.R' 'data.R' From 2491946e0145a191eefdeafc57442a9aa670e833 Mon Sep 17 00:00:00 2001 From: Dawid Kaledkowski Date: Wed, 17 Dec 2025 15:55:35 +0100 Subject: [PATCH 6/8] - @osenan review - not commiting names.join_keys which confuses real items order --- NAMESPACE | 1 - R/join_keys-names.R | 11 ----------- R/join_keys-utils.R | 22 +++++++++++++++------- man/dot-append_indirect_links.Rd | 17 +++++++++++++++++ man/names.join_keys.Rd | 2 -- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c1ace5731..92d44016a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,7 +17,6 @@ S3method(join_keys,default) S3method(join_keys,join_keys) S3method(join_keys,teal_data) S3method(length,teal.data) -S3method(names,join_keys) S3method(names,teal_data) S3method(parents,join_keys) S3method(parents,teal_data) diff --git a/R/join_keys-names.R b/R/join_keys-names.R index 28dcc7b10..f2afcf39c 100644 --- a/R/join_keys-names.R +++ b/R/join_keys-names.R @@ -34,14 +34,3 @@ class(new_x) <- c("join_keys", "list") new_x } - -#' @rdname names.join_keys -#' @export -names.join_keys <- function(x) { - new_x <- unclass(x) - child_parent <- sapply(names(new_x), parent, x = x, USE.NAMES = TRUE, simplify = FALSE) - union( - unlist(topological_sort(child_parent)), - names(new_x) - ) -} diff --git a/R/join_keys-utils.R b/R/join_keys-utils.R index 3f3006a08..0be7dab69 100644 --- a/R/join_keys-utils.R +++ b/R/join_keys-utils.R @@ -110,6 +110,9 @@ assert_compatible_keys2 <- function(x, y) { #' #' Adds the keys between datasets if they are connected by the same set of keys #' via other foreign datasets. +#' Algorithm searches each key pair (parent -> child) and assess if higher-level-ancestor can be +#' linked with child by common keys. +#' Algorithm is protected against infinite recursion and visits each node only once. #' #' @param x (`join_keys`) object to update the keys. #' @@ -118,17 +121,21 @@ assert_compatible_keys2 <- function(x, y) { #' @keywords internal .append_indirect_links <- function(x) { for (node_name in names(x)) { - x <- .search_recursive(x, node_name) + x <- .append_indirect_links_recursive(x, node_name) } x } -.search_recursive <- function(x, node_name, parent_name, nodes_visited = node_name) { - children_names <- setdiff(names(x[[node_name]]), nodes_visited) - nodes_visited <- c(nodes_visited, children_names) +#' @rdname dot-append_indirect_links +#' +#' @param node_name (`character`) name of the current node being processed. +#' @param parent_name (`character`) name of the parent node. If missing, no parent is considered. +#' @param nodes_visited (`character`) vector of node names that have already been visited to prevent cycles. +.append_indirect_links_recursive <- function(x, node_name, parent_name, nodes_visited = character(0)) { + children_names <- setdiff(names(x[[node_name]]), union(nodes_visited, node_name)) + nodes_visited <- union(nodes_visited, children_names) if (length(children_names)) { for (child_name in children_names) { - # todo: make sure they are connected correctly with the keys if (!missing(parent_name)) { ancestors_key_pair <- x[[parent_name]][[node_name]] # !important: using x[a, b] will result in infinite loop this_key_pair <- x[[node_name]][[child_name]] @@ -143,8 +150,9 @@ assert_compatible_keys2 <- function(x, y) { ) ) } - x <- .search_recursive(x = x, node_name = child_name, parent_name = node_name, nodes_visited = nodes_visited) - nodes_visited <- union(nodes_visited, names(x[[node_name]])) + x <- .append_indirect_links_recursive( + x = x, node_name = child_name, parent_name = node_name, nodes_visited = nodes_visited + ) } } x diff --git a/man/dot-append_indirect_links.Rd b/man/dot-append_indirect_links.Rd index 313628f25..ef3fd5793 100644 --- a/man/dot-append_indirect_links.Rd +++ b/man/dot-append_indirect_links.Rd @@ -2,12 +2,26 @@ % Please edit documentation in R/join_keys-utils.R \name{.append_indirect_links} \alias{.append_indirect_links} +\alias{.append_indirect_links_recursive} \title{Append indirect links} \usage{ .append_indirect_links(x) + +.append_indirect_links_recursive( + x, + node_name, + parent_name, + nodes_visited = character(0) +) } \arguments{ \item{x}{(\code{join_keys}) object to update the keys.} + +\item{node_name}{(\code{character}) name of the current node being processed.} + +\item{parent_name}{(\code{character}) name of the parent node. If missing, no parent is considered.} + +\item{nodes_visited}{(\code{character}) vector of node names that have already been visited to prevent cycles.} } \value{ (\code{self}) invisibly for chaining @@ -15,5 +29,8 @@ \description{ Adds the keys between datasets if they are connected by the same set of keys via other foreign datasets. +Algorithm searches each key pair (parent -> child) and assess if higher-level-ancestor can be +linked with child by common keys. +Algorithm is protected against infinite recursion and visits each node only once. } \keyword{internal} diff --git a/man/names.join_keys.Rd b/man/names.join_keys.Rd index 07c95620f..2351b4d1d 100644 --- a/man/names.join_keys.Rd +++ b/man/names.join_keys.Rd @@ -6,8 +6,6 @@ \title{The names of a \code{join_keys} object} \usage{ \method{names}{join_keys}(x) <- value - -\method{names}{join_keys}(x) } \arguments{ \item{x}{an \R object.} From 57977c7d7bbdc9f85e03c91d3d56bbf82e566f20 Mon Sep 17 00:00:00 2001 From: Dawid Kaledkowski Date: Wed, 17 Dec 2025 16:16:41 +0100 Subject: [PATCH 7/8] lintr --- R/join_keys-utils.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/join_keys-utils.R b/R/join_keys-utils.R index 0be7dab69..37d6d69aa 100644 --- a/R/join_keys-utils.R +++ b/R/join_keys-utils.R @@ -131,7 +131,10 @@ assert_compatible_keys2 <- function(x, y) { #' @param node_name (`character`) name of the current node being processed. #' @param parent_name (`character`) name of the parent node. If missing, no parent is considered. #' @param nodes_visited (`character`) vector of node names that have already been visited to prevent cycles. -.append_indirect_links_recursive <- function(x, node_name, parent_name, nodes_visited = character(0)) { +.append_indirect_links_recursive <- function(x, # nolint: object_length_linter + node_name, + parent_name, + nodes_visited = character(0)) { children_names <- setdiff(names(x[[node_name]]), union(nodes_visited, node_name)) nodes_visited <- union(nodes_visited, children_names) if (length(children_names)) { From 452b6dd3824feab531f99552fe8e63d4841e43eb Mon Sep 17 00:00:00 2001 From: Dawid Kaledkowski Date: Fri, 2 Jan 2026 12:18:17 +0100 Subject: [PATCH 8/8] fix performance --- R/join_keys-utils.R | 16 ++++++++++------ man/dot-append_indirect_links.Rd | 9 ++------- tests/testthat/test-join_keys-print.R | 14 +++++++++----- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/R/join_keys-utils.R b/R/join_keys-utils.R index 37d6d69aa..b5729a71e 100644 --- a/R/join_keys-utils.R +++ b/R/join_keys-utils.R @@ -120,8 +120,12 @@ assert_compatible_keys2 <- function(x, y) { #' #' @keywords internal .append_indirect_links <- function(x) { + # environment to track visited nodes + visited_env <- new.env(parent = emptyenv()) + visited_env$nodes <- character(0) + for (node_name in names(x)) { - x <- .append_indirect_links_recursive(x, node_name) + x <- .append_indirect_links_recursive(x, node_name, visited_env = visited_env) } x } @@ -130,13 +134,13 @@ assert_compatible_keys2 <- function(x, y) { #' #' @param node_name (`character`) name of the current node being processed. #' @param parent_name (`character`) name of the parent node. If missing, no parent is considered. -#' @param nodes_visited (`character`) vector of node names that have already been visited to prevent cycles. +#' @param visited_env (`environment`) environment containing visited nodes to prevent cycles. .append_indirect_links_recursive <- function(x, # nolint: object_length_linter node_name, parent_name, - nodes_visited = character(0)) { - children_names <- setdiff(names(x[[node_name]]), union(nodes_visited, node_name)) - nodes_visited <- union(nodes_visited, children_names) + visited_env) { + children_names <- setdiff(names(x[[node_name]]), union(visited_env$nodes, node_name)) + visited_env$nodes <- union(visited_env$nodes, children_names) if (length(children_names)) { for (child_name in children_names) { if (!missing(parent_name)) { @@ -154,7 +158,7 @@ assert_compatible_keys2 <- function(x, y) { ) } x <- .append_indirect_links_recursive( - x = x, node_name = child_name, parent_name = node_name, nodes_visited = nodes_visited + x = x, node_name = child_name, parent_name = node_name, visited_env = visited_env ) } } diff --git a/man/dot-append_indirect_links.Rd b/man/dot-append_indirect_links.Rd index ef3fd5793..ec0299f18 100644 --- a/man/dot-append_indirect_links.Rd +++ b/man/dot-append_indirect_links.Rd @@ -7,12 +7,7 @@ \usage{ .append_indirect_links(x) -.append_indirect_links_recursive( - x, - node_name, - parent_name, - nodes_visited = character(0) -) +.append_indirect_links_recursive(x, node_name, parent_name, visited_env) } \arguments{ \item{x}{(\code{join_keys}) object to update the keys.} @@ -21,7 +16,7 @@ \item{parent_name}{(\code{character}) name of the parent node. If missing, no parent is considered.} -\item{nodes_visited}{(\code{character}) vector of node names that have already been visited to prevent cycles.} +\item{visited_env}{(\code{environment}) environment containing visited nodes to prevent cycles.} } \value{ (\code{self}) invisibly for chaining diff --git a/tests/testthat/test-join_keys-print.R b/tests/testthat/test-join_keys-print.R index 93d4eb4d8..a3fd2330d 100644 --- a/tests/testthat/test-join_keys-print.R +++ b/tests/testthat/test-join_keys-print.R @@ -43,12 +43,16 @@ testthat::test_that("format.join_keys for parents", { }) testthat::test_that("format.join_keys print inferred keys for datasets linked via foreign dataset", { + testthat::skip( + "can find indirect link when extracting x[a, b] but due to performance reasons + format doesn't show indirect links anymore" + ) my_keys <- join_keys( - join_key("d1", "d1", "a"), - join_key("d2", "d2", "b"), - join_key("d3", "d3", "c"), - join_key("d1", "d2", "child-a"), - join_key("d1", "d3", "child-a") + join_key("d1", "d1", "id"), + join_key("d2", "d2", "id"), + join_key("d3", "d3", "id"), + join_key("d1", "d2", keys = c(id = "parent_id")), + join_key("d1", "d3", keys = c(id = "parent_id")) ) testthat::expect_identical(