Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions R/join_keys-extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' given `names`, including parent `names` and symmetric mirror keys between
#' `names` in the result.
#' - `x[i, j]`: Returns join keys between datasets `i` and `j`,
#' including implicit keys inferred from their relationship with a parent.
#' including implicit keys inferred from their relationship via foreign dataset.
#'
#' @param i,j indices specifying elements to extract or replace. Index should be a
#' a character vector, but it can also take numeric, logical, `NULL` or missing.
Expand Down Expand Up @@ -56,7 +56,7 @@
if (is.numeric(i)) i <- names(x)[i]
if (is.numeric(j)) j <- names(x)[j]

subset_x <- update_keys_given_parents(x[union(i, j)])
subset_x <- .append_indirect_links(x[union(i, j)])
return(subset_x[[i]][[j]])
} else if (!missing(j)) {
# ie. select all keys which have j as dataset_2
Expand Down
4 changes: 4 additions & 0 deletions R/join_keys-names.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#' The names of a `join_keys` object
#'
#' @inheritParams base::`names<-`
#' @name names.join_keys


#' @rdname names.join_keys
#' @export
`names<-.join_keys` <- function(x, value) {
new_x <- unclass(x)
Expand Down
10 changes: 5 additions & 5 deletions R/join_keys-print.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ format.join_keys <- function(x, ...) {
my_parents <- parents(x)
names_sorted <- topological_sort(my_parents)
names <- union(names_sorted, names(x))
x_implicit <- update_keys_given_parents(x)
x_indirect <- .append_indirect_links(x)
out <- lapply(names, function(i) {
out_i <- lapply(union(i, names(x[[i]])), function(j) {
direction <- if (identical(my_parents[[j]], i)) {
Expand All @@ -27,13 +27,13 @@ format.join_keys <- function(x, ...) {
)
})

implicit_datasets <- setdiff(names(x_implicit[[i]]), names(x[[i]]))
if (length(implicit_datasets) > 0) {
indirect_datasets <- setdiff(names(x_indirect[[i]]), names(x[[i]]))
if (length(indirect_datasets) > 0) {
out_i <- c(
out_i,
paste0(
" --* (implicit via parent with): ",
paste(implicit_datasets, collapse = ", ")
" --* (indirect via foreign dataset): ",
paste(indirect_datasets, collapse = ", ")
)
)
}
Expand Down
77 changes: 45 additions & 32 deletions R/join_keys-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,48 +106,61 @@ assert_compatible_keys2 <- function(x, y) {
TRUE
}

#' Updates the keys of the datasets based on the parents
#' Append indirect links
#'
#' Adds the keys between datasets if they are connected by the same set of keys
#' via other foreign datasets.
#' Algorithm searches each key pair (parent -> child) and assess if higher-level-ancestor can be
#' linked with child by common keys.
#' Algorithm is protected against infinite recursion and visits each node only once.
#'
#' @param x (`join_keys`) object to update the keys.
#'
#' @return (`self`) invisibly for chaining
#'
#' @keywords internal
update_keys_given_parents <- function(x) {
jk <- join_keys(x)

checkmate::assert_class(jk, "join_keys", .var.name = checkmate::vname(x))

datanames <- names(jk)
for (d1_ix in seq_along(datanames)) {
d1 <- datanames[[d1_ix]]
d1_parent <- parent(jk, d1)
for (d2 in datanames[-1 * seq.int(d1_ix)]) {
if (length(jk[[d1]][[d2]]) == 0) {
d2_parent <- parent(jk, d2)

if (!identical(d1_parent, d2_parent) || length(d1_parent) == 0) next

# both has the same parent -> common keys to parent
keys_d1_parent <- sort(jk[[d1]][[d1_parent]])
keys_d2_parent <- sort(jk[[d2]][[d2_parent]])
.append_indirect_links <- function(x) {
# environment to track visited nodes
visited_env <- new.env(parent = emptyenv())
visited_env$nodes <- character(0)

common_ix_1 <- unname(keys_d1_parent) %in% unname(keys_d2_parent)
common_ix_2 <- unname(keys_d2_parent) %in% unname(keys_d1_parent)

# No common keys between datasets - leave empty
if (all(!common_ix_1)) next
for (node_name in names(x)) {
x <- .append_indirect_links_recursive(x, node_name, visited_env = visited_env)
}
x
}

fk <- structure(
names(keys_d2_parent)[common_ix_2],
names = names(keys_d1_parent)[common_ix_1]
#' @rdname dot-append_indirect_links
#'
#' @param node_name (`character`) name of the current node being processed.
#' @param parent_name (`character`) name of the parent node. If missing, no parent is considered.
#' @param visited_env (`environment`) environment containing visited nodes to prevent cycles.
.append_indirect_links_recursive <- function(x, # nolint: object_length_linter
node_name,
parent_name,
visited_env) {
children_names <- setdiff(names(x[[node_name]]), union(visited_env$nodes, node_name))
visited_env$nodes <- union(visited_env$nodes, children_names)
if (length(children_names)) {
for (child_name in children_names) {
if (!missing(parent_name)) {
ancestors_key_pair <- x[[parent_name]][[node_name]] # !important: using x[a, b] will result in infinite loop
this_key_pair <- x[[node_name]][[child_name]]
if (!identical(unname(ancestors_key_pair), names(this_key_pair))) next
x <- c(
x,
join_key(
dataset_1 = parent_name,
dataset_2 = child_name,
keys = stats::setNames(unname(this_key_pair), names(ancestors_key_pair)),
directed = FALSE
)
)
jk[[d1]][[d2]] <- fk # mutate join key
}
x <- .append_indirect_links_recursive(
x = x, node_name = child_name, parent_name = node_name, visited_env = visited_env
)
}
}
# check parent child relation
assert_parent_child(x = jk)

jk
x
}
31 changes: 31 additions & 0 deletions man/dot-append_indirect_links.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/join_keys.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/names-set-.join_keys.Rd → man/names.join_keys.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 0 additions & 18 deletions man/update_keys_given_parents.Rd

This file was deleted.

24 changes: 19 additions & 5 deletions tests/testthat/test-join_keys-extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ testthat::test_that("join_keys[i,j] returns keys for pair given by numeric indic
testthat::expect_identical(my_keys[2, 1], c(`child-parent` = "child-parent"))
})

testthat::test_that("join_keys[i,j] return NULL for given pair when no such key and no common parent", {
testthat::test_that("join_keys[i,j] return NULL for given pair when not linked via foreign datasets and keys", {
my_keys <- join_keys(
join_key("a", "a", "aa"),
join_key("b", "b", "bb"),
Expand All @@ -153,7 +153,7 @@ testthat::test_that("join_keys[i,j] return NULL for given pair when no such key
})

testthat::test_that(
"join_keys[i,j] doesn't infer keys between children if they don't have common key to parent",
"join_keys[i,j] doesn't infer keys between datasets if they don't refer to the same column in foreign dataset",
{
my_keys <- join_keys(
join_key("a", "a", "aa"),
Expand All @@ -167,7 +167,7 @@ testthat::test_that(
)

testthat::test_that(
"join_keys[i,j] doesn't infer keys between grandchildren",
"join_keys[i,j] doesn't infer keys between grandchildren if they don't refer to the same col in the foreign dataset",
{
my_keys <- join_keys(
join_key("a", "a", "aa"),
Expand All @@ -183,7 +183,7 @@ testthat::test_that(
)

testthat::test_that(
"join_keys[i,j ] infer keys between children through foreign keys to parent. ",
"join_keys[i,j ] infer keys between datasets through foreign datasets to parent. ",
{
my_keys <- join_keys(
join_key("a", "a", "aa"),
Expand All @@ -197,6 +197,20 @@ testthat::test_that(
}
)

testthat::test_that(
"join_keys[i,j ] infer keys between datasets if they are linked to the same col in their common foreign dataset.",
{
my_keys <- join_keys(
join_key("a", "a", "aa", directed = FALSE),
join_key("b", "b", "bb", directed = FALSE),
join_key("c", "c", "cc", directed = FALSE),
join_key("a", "b", c("aa" = "bb")),
join_key("a", "c", c("aa" = "cc"))
)
testthat::expect_identical(my_keys["b", "c"], c(bb = "cc"))
}
)

testthat::test_that("join_keys[i,j] returns NULL for inexisting key pair (can't even infer)", {
my_keys <- join_keys(
join_key("a", "a", "aa"),
Expand Down Expand Up @@ -275,7 +289,7 @@ testthat::test_that("join_keys[i,j]<- removes keys with NULL", {
d1 = list(d1 = c(A = "A")),
d2 = list(d2 = c(B = "B"))
),
parents = setNames(list(), character(0)), # named list
parents = stats::setNames(list(), character(0)), # named list
class = c("join_keys", "list")
)
)
Expand Down
20 changes: 12 additions & 8 deletions tests/testthat/test-join_keys-print.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,26 @@ testthat::test_that("format.join_keys for parents", {
)
})

testthat::test_that("format.join_keys print inferred keys for children sharing parent", {
testthat::test_that("format.join_keys print inferred keys for datasets linked via foreign dataset", {
testthat::skip(
"can find indirect link when extracting x[a, b] but due to performance reasons
format doesn't show indirect links anymore"
)
my_keys <- join_keys(
join_key("d1", "d1", "a"),
join_key("d2", "d2", "b"),
join_key("d3", "d3", "c"),
join_key("d1", "d2", "child-a"),
join_key("d1", "d3", "child-a")
join_key("d1", "d1", "id"),
join_key("d2", "d2", "id"),
join_key("d3", "d3", "id"),
join_key("d1", "d2", keys = c(id = "parent_id")),
join_key("d1", "d3", keys = c(id = "parent_id"))
)

testthat::expect_identical(
format(my_keys),
paste(
"A join_keys object containing foreign keys between 3 datasets:",
"d1: [a]", " <-- d2: [child-a]", " <-- d3: [child-a]",
"d2: [b]", " --> d1: [child-a]", " --* (implicit via parent with): d3",
"d3: [c]", " --> d1: [child-a]", " --* (implicit via parent with): d2",
"d2: [b]", " --> d1: [child-a]", " --* (indirect via foreign dataset): d3",
"d3: [c]", " --> d1: [child-a]", " --* (indirect via foreign dataset): d2",
sep = "\n"
)
)
Expand Down
6 changes: 3 additions & 3 deletions vignettes/join-keys.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,10 @@ jk
| `## <-- ds3: [col_1]` | |
| `## ds2: [col_1, col_2]` | |
| `## --> ds1: [col_1]` | arrow `-->` denotes `ds2` is a child of `ds1` |
| `## --* (implicit via parent with): ds3` | **Implicit relationship between `ds2` & `ds3`**<br />_(given that they share common keys with same parent)_ |
| `## --* (indirect via foreign dataset): ds3` | **Implicit relationship between `ds2` & `ds3`**<br />_(given that they are linked to the same keys in their common foreign dataset)_ |
| `## ds3: [col_1, col_3]` | |
| `## --> ds1: [col_1]` | |
| `## --* (implicit via parent with): ds2` | |
| `## --* (indirect via foreign dataset): ds2` | |
| `## ds4: [no primary keys]` | |
| `## <-> ds5: [col_5]` | **Foreign keys** <br />_(arrow `<->` denotes no parent definition between datasets)_ |
| `## ds5: [no primary keys]` | |
Expand Down Expand Up @@ -174,7 +174,7 @@ c(jk1, join_key("ds3", "ds3", "col_3"))

There are 2 types of relationships encoded with _joining keys_ that are described in the following sections. The _primary_ and _foreign_ keys are created explicitly using the constructor for individual keys (`join_key`).

Additionally, the `join_keys` object infers implicit relationships when two datasets share foreign keys to a parent dataset, but not between themselves. These implicit relationships are available just like another foreign key and can be used to merge datasets, despite not being defined by the user.
Additionally, the `join_keys` object infers indirect relationships when two datasets are linked via the same keys in their foreign dataset, but not between themselves. These implicit relationships are available just like another foreign key and can be used to merge datasets, despite not being defined by the user.

### Primary Key with `teal_data`

Expand Down
Loading