diff --git a/NAMESPACE b/NAMESPACE index 9ba04444..a18217f7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,12 +7,15 @@ S3method(as.statistics_textmodel,data.frame) S3method(as.statistics_textmodel,matrix) S3method(as.textmodel_lss,matrix) S3method(as.textmodel_lss,numeric) +S3method(as.textmodel_lss,textmodel_doc2vec) S3method(as.textmodel_lss,textmodel_lss) +S3method(as.textmodel_lss,textmodel_word2vec) S3method(as.textmodel_lss,textmodel_wordvector) S3method(coef,textmodel_lss) S3method(diagnosys,character) S3method(diagnosys,corpus) S3method(predict,textmodel_lss) +S3method(predict,textmodel_lss3) S3method(print,coefficients_textmodel) S3method(print,statistics_textmodel) S3method(print,summary.textmodel) @@ -35,6 +38,7 @@ export(coefficients.textmodel_lss) export(cohesion) export(diagnosys) export(optimize_lss) +export(rowMaxs) export(seedwords) export(smooth_lss) export(textmodel_lss) diff --git a/R/as.textmodel.R b/R/as.textmodel.R index 53789ce8..f975f96c 100644 --- a/R/as.textmodel.R +++ b/R/as.textmodel.R @@ -30,7 +30,6 @@ as.textmodel_lss.matrix <- function(x, seeds, nested_weight = TRUE, verbose = FALSE, ...) { - args <- list(terms = terms, seeds = seeds) if (is.null(colnames(x))) stop("x must have column names for features") if (any(is.na(colnames(x)))) @@ -38,8 +37,8 @@ as.textmodel_lss.matrix <- function(x, seeds, if (any(is.na(x))) stop("x must not have NA") - seeds <- expand_seeds(seeds, colnames(x), nested_weight, verbose) - seed <- unlist(unname(seeds)) + s <- expand_seeds(seeds, colnames(x), nested_weight, verbose) + seed <- unlist(unname(s)) theta <- get_theta(terms, colnames(x)) if (is.null(slice)) { @@ -57,8 +56,8 @@ as.textmodel_lss.matrix <- function(x, seeds, beta = beta, k = nrow(x), slice = slice, - terms = args$terms, - seeds = args$seeds, + terms = terms, + seeds = seeds, seeds_weighted = seed, embedding = x, similarity = simil$seed, @@ -100,66 +99,3 @@ as.textmodel_lss.textmodel_lss <- function(x, ...) { result$frequency <- x$frequency[names(result$beta)] return(result) } - -#' @rdname as.textmodel_lss -#' @export -#' @method as.textmodel_lss textmodel_wordvector -as.textmodel_lss.textmodel_wordvector <- function(x, seeds, - terms = NULL, - nested_weight = TRUE, - verbose = FALSE, - spatial = TRUE, - ...) { - - #args <- list(terms = terms, seeds = seeds) - spatial <- check_logical(spatial) - if (spatial) { - - if (x$version == as.numeric_version("0.1.0")) { - values <- x$vector - } else if (x$version >= as.numeric_version("0.6.0")) { - values <- x$values$word - } else { - values <- x$values - } - result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms, - nested_weight = nested_weight, ...) - result$frequency <- x$frequency[names(result$beta)] - result$type = "word2vec" - result$call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) - - } else { - - if (!requireNamespace("wordvector")) - stop("wordvector package must be installed") - if (x$version < as.numeric_version("0.2.0")) - stop("wordvector package must be v0.2.0 or later") - - seeds <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose) - seed <- unlist(unname(seeds)) - theta <- get_theta(terms, names(x$frequency)) - - suppressWarnings({ - if (packageVersion("wordvector") >= "0.6.0") { - prob <- wordvector::probability(x, names(seed), mode = "numeric") - } else { - prob <- wordvector::probability(x, names(seed), mode = "values") - } - }) - beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta - - result <- build_lss( - beta = beta, - beta_type = "probability", - k = x$dim, - terms = terms, - seeds = seeds, - seeds_weighted = seed, - frequency = x$frequency[names(beta)], - type = "word2vec", - spatial = FALSE, - call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) - ) - } - return(result) -} diff --git a/R/as.textmodel2.R b/R/as.textmodel2.R new file mode 100644 index 00000000..15677536 --- /dev/null +++ b/R/as.textmodel2.R @@ -0,0 +1,72 @@ + +#' @rdname as.textmodel_lss +#' @export +#' @method as.textmodel_lss textmodel_word2vec +as.textmodel_lss.textmodel_word2vec <- function(x, seeds, + terms = NULL, + nested_weight = TRUE, + verbose = FALSE, + spatial = TRUE, + ...) { + + #args <- list(terms = terms, seeds = seeds) + spatial <- check_logical(spatial) + if (spatial) { + + if (x$version == as.numeric_version("0.1.0")) { + values <- x$vector + } else if (x$version >= as.numeric_version("0.6.0")) { + values <- x$values$word + } else { + values <- x$values + } + result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms, + nested_weight = nested_weight, ...) + result$frequency <- x$frequency[names(result$beta)] + result$concatenator <- x$concatenator + result$type <- "word2vec" + result$call <- try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) + + } else { + + if (!requireNamespace("wordvector")) + stop("wordvector package must be installed") + if (x$version < as.numeric_version("0.2.0")) + stop("wordvector package must be v0.2.0 or later") + + s <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose) + seed <- unlist(unname(s)) + theta <- get_theta(terms, names(x$frequency)) + + suppressWarnings({ + if (packageVersion("wordvector") >= "0.6.0") { + prob <- wordvector::probability(x, names(seed), mode = "numeric") + } else { + prob <- wordvector::probability(x, names(seed), mode = "values") + } + }) + beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta + + result <- build_lss( + beta = beta, + beta_type = "probability", + k = x$dim, + terms = terms, + seeds = seeds, + seeds_weighted = seed, + frequency = x$frequency[names(beta)], + concatenator = x$concatenator, + type = "word2vec", + spatial = FALSE, + call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) + ) + } + return(result) +} + +#' Support wordvector version 0.5.1 or earlier +#' @export +#' @method as.textmodel_lss textmodel_wordvector +#' @keywords internal +as.textmodel_lss.textmodel_wordvector <- as.textmodel_lss.textmodel_word2vec + diff --git a/R/as.textmodel3.R b/R/as.textmodel3.R new file mode 100644 index 00000000..5cb9fa6c --- /dev/null +++ b/R/as.textmodel3.R @@ -0,0 +1,76 @@ +#' @noRd +#' @export +#' @keywords internal +#' @param prob_mode select how to compute polarity for documents. See details. +#' @details +#' If `x` is a [wordvector::textmodel_doc2vec] object, it computes +#' polarity scores of documents based on their probabilities for seed words. +#' If `prob_mode = "mean"`, it averages the probabilities, but it keeps used only +#' the highest probability for one of the seed words if `"max"`. +#' +#' @method as.textmodel_lss textmodel_doc2vec +as.textmodel_lss.textmodel_doc2vec <- function(x, seeds, prob_mode = c("mean", "max"), + ...) { + + prob_mode <- match.arg(prob_mode) + + if (!requireNamespace("wordvector")) + stop("wordvector package must be installed") + if (x$version < as.numeric_version("0.6.0")) + stop("wordvector package must be v0.6.0 or later") + + s <- expand_seeds(seeds, names(x$frequency), nested_weight = FALSE) + seed <- unlist(unname(s)) + prob <- wordvector::probability(x, names(seed), layer = "document", mode = "numeric") + + if (prob_mode == "max") { + alpha <- rowMaxs(prob %*% diag(sign(seed)), absolute = TRUE) + } else { + alpha <- rowSums(prob %*% diag(seed)) + } + + # pseudo beta + if (is.null(x$data)) { + freq <- x$frequency + beta <- NULL + } else { + data <- dfm(x$data, remove_padding = TRUE) + data <- dfm_select(data, names(x$frequency), valuetype = "fixed", + case_insensitive = TRUE) + freq <- colSums(data) + beta <- colSums(data * alpha) / freq + } + + result <- build_lss( + seeds = seeds, + seeds_weighted = seed, + beta = beta, + beta_type = "dummy", + terms = names(freq), + frequency = freq, + concatenator = x$concatenator, + type = "doc2vec", + spatial = FALSE, + call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) + ) + # extra information + result$prob_mode <- prob_mode + result$alpha <- alpha + result$length <- x$ntoken + + class(result) <- c("textmodel_lss3", class(result)) + return(result) + +} + +#' @export +#' @keywords internal +#' @method predict textmodel_lss3 +predict.textmodel_lss3 <- function(object, min_n = 0L, ...) { + n <- object$length + p <- object$alpha + if (min_n > 0) + p <- p * (n / pmax(n, min_n)) + p[n == 0] <- NA_real_ + return(p) +} diff --git a/R/textmodel_lss.R b/R/textmodel_lss.R index 8085d814..1fc32ef0 100644 --- a/R/textmodel_lss.R +++ b/R/textmodel_lss.R @@ -106,8 +106,8 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL, k <- check_integer(k, min_len = 1, max_len = 1, min = 2, max = nrow(x)) engine <- match.arg(engine) - seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose) - seed <- unlist(unname(seeds)) + s <- expand_seeds(seeds, featnames(x), nested_weight, verbose) + seed <- unlist(unname(s)) theta <- get_theta(terms, featnames(x)) feat <- union(names(theta), names(seed)) @@ -136,11 +136,11 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL, k = k, slice = slice, frequency = colSums(x)[names(beta)], - terms = args$terms, - seeds = args$seeds, + terms = terms, + seeds = seeds, seeds_weighted = seed, embedding = embed, - similarity = simil$seed, + similarity = simil$seed, # TODO: remove concatenator = meta(x, field = "concatenator", type = "object"), type = "svd", call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE), @@ -268,7 +268,7 @@ expand_seeds <- function(seeds, features, nested_weight = TRUE, verbose = FALSE) seeds_weighted <- weight_seeds(seeds, features, nested_weight) if (all(lengths(seeds_weighted) == 0)) - stop("No seed word is found in the dfm", call. = FALSE) + stop("Seed words are not found in x", call. = FALSE) return(seeds_weighted) } @@ -386,7 +386,7 @@ get_seeds <- function(seeds) { seeds <- structure(rep(1, length(seeds)), names = seeds) if (is.null(names(seeds))) - stop("y must be a named-numerid vector\n", call. = FALSE) + stop("seeds must be a character or named-numeric vector\n", call. = FALSE) return(seeds) } diff --git a/R/textplot.R b/R/textplot.R index e39457b8..02cb2475 100644 --- a/R/textplot.R +++ b/R/textplot.R @@ -70,6 +70,9 @@ textplot_terms.textmodel_lss <- function(x, highlighted = NULL, max_highlighted <- check_integer(max_highlighted, min = 0) sampling <- match.arg(sampling) + if (x$type == "doc2vec" && is.null(x$beta)) + stop("x must be trained with include_data = TRUE to compute pseudo word polarity") + x$frequency <- x$frequency[names(x$beta)] # fix for < v1.1.4 x$frequency[is.na(x$frequency)] <- 0 diff --git a/R/utils.R b/R/utils.R index 809adf28..e0bc95ff 100644 --- a/R/utils.R +++ b/R/utils.R @@ -230,3 +230,16 @@ print.textmodel_lss <- function(x, ...) { print(x$call) cat("\n") } + +#' Maximum values in each row of a matrix +#' @param x a matrix object. +#' @param absolute if `TRUE`, return largest positive or negative values. +#' @export +#' @keywords internal +rowMaxs <- function(x, absolute = FALSE) { + stopifnot(is.matrix(x)) + y <- if (absolute) abs(x) else x + structure(x[cbind(seq_len(nrow(x)), max.col(y, "first"))], + names = rownames(x)) +} + diff --git a/man/as.textmodel_lss.Rd b/man/as.textmodel_lss.Rd index 1afa11d8..f3913c6a 100644 --- a/man/as.textmodel_lss.Rd +++ b/man/as.textmodel_lss.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as.textmodel.R +% Please edit documentation in R/as.textmodel.R, R/as.textmodel2.R \name{as.textmodel_lss} \alias{as.textmodel_lss} \alias{as.textmodel_lss.matrix} \alias{as.textmodel_lss.numeric} \alias{as.textmodel_lss.textmodel_lss} -\alias{as.textmodel_lss.textmodel_wordvector} +\alias{as.textmodel_lss.textmodel_word2vec} \title{Create a Latent Semantic Scaling model from various objects} \usage{ as.textmodel_lss(x, ...) @@ -25,7 +25,7 @@ as.textmodel_lss(x, ...) \method{as.textmodel_lss}{textmodel_lss}(x, ...) -\method{as.textmodel_lss}{textmodel_wordvector}( +\method{as.textmodel_lss}{textmodel_word2vec}( x, seeds, terms = NULL, diff --git a/man/as.textmodel_lss.textmodel_wordvector.Rd b/man/as.textmodel_lss.textmodel_wordvector.Rd new file mode 100644 index 00000000..c351f5b4 --- /dev/null +++ b/man/as.textmodel_lss.textmodel_wordvector.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as.textmodel2.R +\name{as.textmodel_lss.textmodel_wordvector} +\alias{as.textmodel_lss.textmodel_wordvector} +\title{Support wordvector version 0.5.1 or earlier} +\usage{ +\method{as.textmodel_lss}{textmodel_wordvector}( + x, + seeds, + terms = NULL, + nested_weight = TRUE, + verbose = FALSE, + spatial = TRUE, + ... +) +} +\description{ +Support wordvector version 0.5.1 or earlier +} +\keyword{internal} diff --git a/man/rowMaxs.Rd b/man/rowMaxs.Rd new file mode 100644 index 00000000..ccf5bed9 --- /dev/null +++ b/man/rowMaxs.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{rowMaxs} +\alias{rowMaxs} +\title{Maximum values in each row of a matrix} +\usage{ +rowMaxs(x, absolute = FALSE) +} +\arguments{ +\item{x}{a matrix object.} + +\item{absolute}{if \code{TRUE}, return largest positive or negative values.} +} +\description{ +Maximum values in each row of a matrix +} +\keyword{internal} diff --git a/tests/data/doc2vec.RDS b/tests/data/doc2vec.RDS new file mode 100644 index 00000000..d83dac98 Binary files /dev/null and b/tests/data/doc2vec.RDS differ diff --git a/tests/data/save.R b/tests/data/save.R index b52db013..544cd385 100644 --- a/tests/data/save.R +++ b/tests/data/save.R @@ -12,8 +12,12 @@ seed <- as.seedwords(data_dictionary_sentiment) lss <- textmodel_lss(dfmt, seed, terms = feat, k = 300) saveRDS(lss, "tests/data/lss_k300.RDS") -wdv <- textmodel_word2vec(head(toks, 10), min_count = 1) -saveRDS(wdv, "tests/data/word2vec.RDS") +wov <- textmodel_word2vec(head(toks, 10), min_count = 1) +saveRDS(wov, "tests/data/word2vec.RDS") -wdv2 <- textmodel_word2vec(head(toks, 10), min_count = 1, normalize = FALSE) -saveRDS(wdv2, "tests/data/word2vec-prob.RDS") +wov2 <- textmodel_word2vec(head(toks, 10), min_count = 1, normalize = FALSE) +saveRDS(wov2, "tests/data/word2vec-prob.RDS") + +dov <- textmodel_doc2vec(head(toks, 100), type = "dbow", min_count = 1, + include_data = TRUE) +saveRDS(dov, "tests/data/doc2vec.RDS") diff --git a/tests/testthat/test-as.textmodel.R b/tests/testthat/test-as.textmodel.R index 932765f4..1e26e0e5 100644 --- a/tests/testthat/test-as.textmodel.R +++ b/tests/testthat/test-as.textmodel.R @@ -4,8 +4,8 @@ mat_test <- readRDS("../data/matrix_k100.RDS") toks_test <- readRDS("../data/tokens.RDS") feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), 100) dfmt_test <- dfm(toks_test) -seed <- as.seedwords(data_dictionary_sentiment) -lss_test <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 50, +seed_test <- as.seedwords(data_dictionary_sentiment) +lss_test <- textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 50, include_data = FALSE) test_that("as.textmodel_lss works with matrix", { @@ -13,7 +13,7 @@ test_that("as.textmodel_lss works with matrix", { term <- c("decision", "instance", "universal", "foundations", "the") # with terms - lss1 <- as.textmodel_lss(mat_test, seed, term) + lss1 <- as.textmodel_lss(mat_test, seed_test, term) expect_equal(names(lss1), names(LSX:::build_lss())) expect_identical(lss1$embedding, mat_test) expect_false(any(duplicated(names(coef(lss1))))) @@ -23,7 +23,7 @@ test_that("as.textmodel_lss works with matrix", { is.na(pred1)) # without terms - lss2 <- as.textmodel_lss(mat_test, seed) + lss2 <- as.textmodel_lss(mat_test, seed_test) expect_equal(names(lss2), names(LSX:::build_lss())) expect_identical(lss2$embedding, mat_test) expect_false(any(duplicated(names(coef(lss2))))) @@ -35,54 +35,54 @@ test_that("as.textmodel_lss works with matrix", { # with special features mat_special <- mat_test colnames(mat_special)[1:2] <- c("", "*") - lss3 <- as.textmodel_lss(mat_special, seed) + lss3 <- as.textmodel_lss(mat_special, seed_test) expect_equal(sum("" == names(coef(lss3))), 0) expect_equal(sum("*" == names(coef(lss3))), 1) # with slice - lss4 <- as.textmodel_lss(mat_test, seed, slice = 50) + lss4 <- as.textmodel_lss(mat_test, seed_test, slice = 50) expect_error( - as.textmodel_lss(mat_test, seed, slice = 150), + as.textmodel_lss(mat_test, seed_test, slice = 150), "The value of slice must be between 1 and 100" ) expect_error( - as.textmodel_lss(mat_test, seed, slice = 1:150), + as.textmodel_lss(mat_test, seed_test, slice = 1:150), "The length of slice must be between 1 and 100" ) expect_identical(coef(lss4), - coef(as.textmodel_lss(mat_test, seed, slice = 1:50))) + coef(as.textmodel_lss(mat_test, seed_test, slice = 1:50))) expect_identical(lss4$embedding, mat_test) }) test_that("as.textmodel_lss errors with invalid columns", { - seed <- as.seedwords(data_dictionary_sentiment) + mat_nocol <- mat_nacol <- mat_na <- mat_test colnames(mat_nocol) <- NULL - expect_error(as.textmodel_lss(mat_nocol, seed), + expect_error(as.textmodel_lss(mat_nocol, seed_test), "x must have column names for features") colnames(mat_nacol)[1] <- NA - expect_error(as.textmodel_lss(mat_nacol, seed), + expect_error(as.textmodel_lss(mat_nacol, seed_test), "x must not have NA in the column names") mat_na[1,1] <- NA - expect_error(as.textmodel_lss(mat_na, seed), + expect_error(as.textmodel_lss(mat_na, seed_test), "x must not have NA") }) test_that("as.textmodel_lss works with textmodel_lss", { # with fitted model - lss <- as.textmodel_lss(lss_test, seed, terms = feat_test, slice = 10) + lss <- as.textmodel_lss(lss_test, seed_test, terms = feat_test, slice = 10) expect_equal(lss$embedding, lss_test$embedding) expect_identical(lss$data, lss_test$data) expect_identical(lss$frequency, lss_test$frequency) expect_identical(lss$concatenator, lss_test$concatenator) expect_error( - as.textmodel_lss(lss_test, seed, slice = 100), + as.textmodel_lss(lss_test, seed_test, slice = 100), "The value of slice must be between 1 and 50" ) expect_error( - as.textmodel_lss(lss_test, seed, slice = 1:100), + as.textmodel_lss(lss_test, seed_test, slice = 1:100), "The length of slice must be between 1 and 50" ) @@ -91,95 +91,59 @@ test_that("as.textmodel_lss works with textmodel_lss", { "foundations" = 0.3, "the" = 0) lss_dummy <- as.textmodel_lss(weight) expect_error( - as.textmodel_lss(lss_dummy, seed), + as.textmodel_lss(lss_dummy, seed_test), "x must be a valid textmodel_lss object" ) }) -test_that("as.textmodel_lss works with textmodel_wordvector", { - - skip_if_not(utils::packageVersion("wordvector") >= "0.6.0") - - # spatial - wdv <- readRDS("../data/word2vec.RDS") - lss <- as.textmodel_lss(wdv, seed) - - expect_equal(lss$beta_type, "similarity") - expect_equal(lss$embedding, t(wdv$values)) - expect_identical(lss$frequency, wdv$frequency) - expect_identical(names(lss$frequency), names(lss$frequency)) - expect_identical(names(lss$beta), names(lss$frequency)) - - expect_error( - as.textmodel_lss(wdv, seed, spatial = FALSE), - "x must be trained with normalize = FALSE" - ) - - # probabilistic - wdv2 <- readRDS("../data/word2vec-prob.RDS") - lss2 <- as.textmodel_lss(wdv2, seed, spatial = FALSE) - - expect_equal(lss2$beta_type, "probability") - expect_true(is.null(lss2$embedding)) - expect_identical(lss2$frequency, wdv2$frequency) - expect_identical(names(lss2$frequency), names(wdv2$frequency)) - expect_identical(names(lss2$beta), names(lss2$frequency)) - - - lss3 <- as.textmodel_lss(wdv2, "good", spatial = FALSE) # single seed - expect_true(is.null(lss3$embedding)) - expect_identical(lss3$frequency, wdv2$frequency) - expect_identical(names(lss3$frequency), names(wdv2$frequency)) - expect_identical(names(lss3$beta), names(lss3$frequency)) -}) test_that("as.textmodel_lss works with vector", { - weight <- c("decision" = 0.1, "instance" = -0.1, - "foundations" = 0.3, "the" = 0) - lss <- as.textmodel_lss(weight) - expect_equal(names(lss), names(LSX:::build_lss())) - pred <- predict(lss, dfmt_test) - expect_equal(names(pred), rownames(dfmt_test)) - expect_equal(rowSums(dfmt_test[,names(lss$beta)]) == 0, - is.na(pred)) + weight <- c("decision" = 0.1, "instance" = -0.1, + "foundations" = 0.3, "the" = 0) + lss <- as.textmodel_lss(weight) + expect_equal(names(lss), names(LSX:::build_lss())) + pred <- predict(lss, dfmt_test) + expect_equal(names(pred), rownames(dfmt_test)) + expect_equal(rowSums(dfmt_test[,names(lss$beta)]) == 0, + is.na(pred)) }) test_that("as.textmodel_lss errors with vector", { - weight <- c("decision" = 0.1, "instance" = -0.1, - "foundations" = 0.3, "the" = 0) - weight_noname <- weight_naname <- weight_na <- weight - names(weight_noname) <- NULL - expect_error(as.textmodel_lss(weight_noname), - "x must have names for features") - names(weight_naname)[1] <- NA - expect_error(as.textmodel_lss(weight_naname), - "x must not have NA in the names") - weight_na[1] <- NA - expect_error(as.textmodel_lss(weight_na), - "x must not have NA") + weight <- c("decision" = 0.1, "instance" = -0.1, + "foundations" = 0.3, "the" = 0) + weight_noname <- weight_naname <- weight_na <- weight + names(weight_noname) <- NULL + expect_error(as.textmodel_lss(weight_noname), + "x must have names for features") + names(weight_naname)[1] <- NA + expect_error(as.textmodel_lss(weight_naname), + "x must not have NA in the names") + weight_na[1] <- NA + expect_error(as.textmodel_lss(weight_na), + "x must not have NA") }) test_that("terms is working", { - skip_on_cran() - - lss <- textmodel_lss(dfmt_test, seed, k = 50) - - # glob pattern - lss1 <- as.textmodel_lss(lss, seed, terms = "poli*") - expect_equal(sum(stringi::stri_startswith_fixed(names(lss1$beta), "poli")), 11) - expect_identical(names(lss1$beta), names(lss1$frequency)) - - # numeric vector - weight <- sample(1:10, length(lss1$beta), replace = TRUE) / 10 - names(weight) <- names(lss1$beta) - lss2 <- as.textmodel_lss(lss, seed, terms = weight) - expect_true(all(lss2$beta == lss1$beta * weight)) - expect_error(as.textmodel_lss(lss, seed, terms = c("polity" = 0.2, "politic" = -0.1)), - "terms must be positive values without NA") - expect_error(as.textmodel_lss(lss, seed, terms = c("polity" = 0.2, "politic" = NA)), - "terms must be positive values without NA") - expect_error(as.textmodel_lss(lss, seed, terms = c(01, 0.2)), - "terms must be named") + skip_on_cran() + + lss <- textmodel_lss(dfmt_test, seed_test, k = 50) + + # glob pattern + lss1 <- as.textmodel_lss(lss, seed_test, terms = "poli*") + expect_equal(sum(stringi::stri_startswith_fixed(names(lss1$beta), "poli")), 11) + expect_identical(names(lss1$beta), names(lss1$frequency)) + + # numeric vector + weight <- sample(1:10, length(lss1$beta), replace = TRUE) / 10 + names(weight) <- names(lss1$beta) + lss2 <- as.textmodel_lss(lss, seed_test, terms = weight) + expect_true(all(lss2$beta == lss1$beta * weight)) + expect_error(as.textmodel_lss(lss, seed_test, terms = c("polity" = 0.2, "politic" = -0.1)), + "terms must be positive values without NA") + expect_error(as.textmodel_lss(lss, seed_test, terms = c("polity" = 0.2, "politic" = NA)), + "terms must be positive values without NA") + expect_error(as.textmodel_lss(lss, seed_test, terms = c(01, 0.2)), + "terms must be named") }) diff --git a/tests/testthat/test-as.textmodel2.R b/tests/testthat/test-as.textmodel2.R new file mode 100644 index 00000000..2bbe7299 --- /dev/null +++ b/tests/testthat/test-as.textmodel2.R @@ -0,0 +1,56 @@ + +seed <- as.seedwords(data_dictionary_sentiment) + +test_that("as.textmodel_lss works with textmodel_wordvector", { + + skip_if_not(utils::packageVersion("wordvector") >= "0.6.0") + + # spatial + wdv <- readRDS("../data/word2vec.RDS") + lss <- as.textmodel_lss(wdv, seed) + + expect_equal(lss$beta_type, "similarity") + expect_equal(lss$embedding, t(wdv$values)) + expect_identical(lss$frequency, wdv$frequency) + expect_identical( + names(lss$frequency), + names(lss$frequency) + ) + expect_identical( + names(lss$beta), + names(lss$frequency) + ) + expect_error( + as.textmodel_lss(wdv, seed, spatial = FALSE), + "x must be trained with normalize = FALSE" + ) + + # probabilistic + wdv2 <- readRDS("../data/word2vec-prob.RDS") + lss2 <- as.textmodel_lss(wdv2, seed, spatial = FALSE) + + expect_equal(lss2$beta_type, "probability") + expect_true(is.null(lss2$embedding)) + expect_identical(lss2$frequency, wdv2$frequency) + expect_identical( + names(lss2$frequency), + names(wdv2$frequency) + ) + expect_identical( + names(lss2$beta), + names(lss2$frequency) + ) + + # single seed + lss3 <- as.textmodel_lss(wdv2, "good", spatial = FALSE) + expect_true(is.null(lss3$embedding)) + expect_identical(lss3$frequency, wdv2$frequency) + expect_identical( + names(lss3$frequency), + names(wdv2$frequency) + ) + expect_identical( + names(lss3$beta), + names(lss3$frequency) + ) +}) diff --git a/tests/testthat/test-as.textmodel3.R b/tests/testthat/test-as.textmodel3.R new file mode 100644 index 00000000..0d9e0b1a --- /dev/null +++ b/tests/testthat/test-as.textmodel3.R @@ -0,0 +1,123 @@ + +dov_test <- readRDS("../data/doc2vec.RDS") + +test_that("as.textmodel_lss is working", { + + skip_if_not(utils::packageVersion("wordvector") >= "0.6.0") + + seed <- as.seedwords(data_dictionary_sentiment) + lss <- as.textmodel_lss(dov_test, seed) + + expect_equal( + lss$seeds, + seed + ) + expect_equal( + lss$seeds_weighted, + c(good = 0.5, superior = 0.5, + wrong = -0.5, inferior = -0.5) + ) + expect_equal( + lss$prob_mode, + "mean" + ) + expect_equal( + lss$beta_type, "dummy" + ) + expect_true( + is.null(lss$embedding) + ) + expect_identical(lss$frequency, dov_test$frequency) + expect_identical( + names(lss$frequency), names(dov_test$frequency) + ) + expect_identical( + names(lss$beta), names(dov_test$frequency) + ) + expect_true( + ggplot2::is_ggplot(textplot_terms(lss)) + ) + expect_true( + all(predict(lss) == lss$alpha) + ) + expect_false( + all(predict(lss, min_n = 10) == lss$alpha) + ) + expect_true( + ggplot2::is_ggplot(textplot_terms(lss)) + ) + + # errors + expect_error( + as.textmodel_lss(dov_test, c(1, 2)), + "seeds must be a character or named-numeric vector" + ) + expect_error( + as.textmodel_lss(dov_test, prob_mode = "xxx"), + "'arg' should be one of" + ) + expect_error( + as.textmodel_lss(dov_test, "xxxx"), + "Seed words are not found in x" + ) + + # single seed + lss2 <- as.textmodel_lss(dov_test, "good") + expect_equal( + lss2$prob_mode, "mean" + ) + expect_equal(lss2$seeds, "good") + expect_equal(lss2$seeds_weighted, + c("good" = 1)) + + # glob seeds + seed <- c("america" = 1, "nation*" = 1, "foreign*" = -1) + lss3 <- as.textmodel_lss(dov_test, seed, prob_mode = "mean") + expect_equal( + lss3$seeds, + c("america" = 1, "nation*" = 1, "foreign*" = -1) + ) + expect_equal( + lss3$seeds_weighted, + c("america" = 0.25, "nations" = 0.25, "nation" = 0.25, "national" = 0.25, + "foreign" = -1) + ) + + # prob_mode + seed <- c("government" = 1, "citizen" = -1) + lss4 <- as.textmodel_lss(dov_test, seed, prob_mode = "mean") + lss5 <- as.textmodel_lss(dov_test, seed, prob_mode = "max") + expect_true( + any(predict(lss4) < 0) + ) + expect_true( + any(predict(lss5) < 0) + ) + + # doc2vec has no data + dov_test_nd <- dov_test + dov_test_nd$data <- NULL + lss6 <- as.textmodel_lss(dov_test_nd, seed) + + expect_error( + textplot_terms(lss6), + "x must be trained with include_data = TRUE" + ) + + expect_true( + all(predict(lss6) != predict(lss6, min_n = 1000)) + ) + + # empty document + dov_test_empty <- dov_test + dov_test_empty$ntoken[c(10, 20, 30)] <- 0 + lss7 <- as.textmodel_lss(dov_test_empty, seed) + + expect_equal( + predict(lss7)[c(10, 20, 30)], + c("1789-Washington.10" = NA_real_, + "1789-Washington.20" = NA_real_, + "1797-Adams.3" = NA_real_) + ) + +}) diff --git a/tests/testthat/test-textmodel_lss.R b/tests/testthat/test-textmodel_lss.R index e2942ad0..e4df311e 100644 --- a/tests/testthat/test-textmodel_lss.R +++ b/tests/testthat/test-textmodel_lss.R @@ -10,13 +10,13 @@ feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), dfmt_test <- dfm(toks_test) fcmt_test <- fcm(dfmt_test) -seed <- as.seedwords(data_dictionary_sentiment) -lss_test <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, +seed_test <- as.seedwords(data_dictionary_sentiment) +lss_test <- textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 300, include_data = TRUE) -lss_test_nd <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, +lss_test_nd <- textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 300, include_data = FALSE) -lss_test_ss <- textmodel_lss(dfmt_test, seed[1], terms = feat_test, k = 300) -lss_test_fcm <- textmodel_lss(fcmt_test, seed, terms = feat_test, k = 50) +lss_test_ss <- textmodel_lss(dfmt_test, seed_test[1], terms = feat_test, k = 300) +lss_test_fcm <- textmodel_lss(fcmt_test, seed_test, terms = feat_test, k = 50) test_that("char_context is working", { @@ -200,25 +200,25 @@ test_that("terms is working", { skip_on_cran() # glob pattern - lss1 <- textmodel_lss(dfmt_test, seed, terms = "poli*", k = 300) + lss1 <- textmodel_lss(dfmt_test, seed_test, terms = "poli*", k = 300) expect_true(all(stringi::stri_startswith_fixed(names(lss1$beta), "poli"))) # numeric vector weight <- sample(1:10, length(lss1$beta), replace = TRUE) / 10 names(weight) <- names(lss1$beta) - lss2 <- textmodel_lss(dfmt_test, seed, terms = weight, k = 300) + lss2 <- textmodel_lss(dfmt_test, seed_test, terms = weight, k = 300) expect_true(all(lss2$beta == lss1$beta * weight)) - expect_error(textmodel_lss(dfmt_test, seed, terms = c("polity" = 0.2, "politic" = -0.1), k = 300), + expect_error(textmodel_lss(dfmt_test, seed_test, terms = c("polity" = 0.2, "politic" = -0.1), k = 300), "terms must be positive values without NA") - expect_error(textmodel_lss(dfmt_test, seed, terms = c("polity" = 0.2, "politic" = NA), k = 300), + expect_error(textmodel_lss(dfmt_test, seed_test, terms = c("polity" = 0.2, "politic" = NA), k = 300), "terms must be positive values without NA") - expect_error(textmodel_lss(dfmt_test, seed, terms = c(01, 0.2), k = 300), + expect_error(textmodel_lss(dfmt_test, seed_test, terms = c(01, 0.2), k = 300), "terms must be named") }) test_that("terms work with numeric vector", { - lss <- textmodel_lss(dfmt_test, seed, terms = "poli*", k = 300) + lss <- textmodel_lss(dfmt_test, seed_test, terms = "poli*", k = 300) expect_true(all(stringi::stri_startswith_fixed(names(coef(lss)), "poli"))) }) @@ -301,7 +301,7 @@ test_that("textmodel_lss works with non-existent seeds", { seed2 <- c("xyz", "xxx") expect_error(textmodel_lss(dfmt_test, seed2, k = 10), - "No seed word is found in the dfm") + "Seed words are not found in x") }) test_that("rsvd and irlba work", { @@ -355,14 +355,14 @@ test_that("weight is working", { test_that("slice argument is working", { expect_identical( - dim(textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 100)$embedding), - dim(textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 1:100)$embedding) + dim(textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 300, slice = 100)$embedding), + dim(textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 300, slice = 1:100)$embedding) ) expect_silent( - textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 1:100) + textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 300, slice = 1:100) ) expect_error( - textmodel_lss(dfmt_test, seed, terms = feat_test, k = 300, slice = 1:400), + textmodel_lss(dfmt_test, seed_test, terms = feat_test, k = 300, slice = 1:400), "The length of slice must be between 1 and 300" ) }) @@ -379,12 +379,12 @@ test_that("old argument still works", { skip_on_cran() # takes to much time suppressWarnings({ - lss <- textmodel_lss(dfmt_test, seed, features = feat_test, k = 300) + lss <- textmodel_lss(dfmt_test, seed_test, features = feat_test, k = 300) }) expect_equal(lss_test$terms, lss$terms) suppressWarnings({ - lss_fcm <- textmodel_lss(fcmt_test, seed, features = feat_test, w = 50) + lss_fcm <- textmodel_lss(fcmt_test, seed_test, features = feat_test, w = 50) }) expect_equal(lss_test$terms, lss_fcm$terms) }) @@ -485,11 +485,11 @@ test_that("rescaling still works", { test_that("textmodel_lss print messages", { expect_output( - textmodel_lss(dfmt_test, seed, k = 100, verbose = TRUE), + textmodel_lss(dfmt_test, seed_test, k = 100, verbose = TRUE), "Performing SVD by RSpectra", fixed = TRUE ) expect_warning( - textmodel_lss(dfmt_test, seed, features = feat_test, k = 100), + textmodel_lss(dfmt_test, seed_test, features = feat_test, k = 100), "'features' is deprecated; use 'terms'", fixed = TRUE ) diff --git a/tests/testthat/test-textmodel_lss2.R b/tests/testthat/test-textmodel_lss2.R index d93f3ad0..e21f6a81 100644 --- a/tests/testthat/test-textmodel_lss2.R +++ b/tests/testthat/test-textmodel_lss2.R @@ -7,14 +7,14 @@ require(quanteda) toks_test <- readRDS("../data/tokens.RDS") feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), 100) -seed <- as.seedwords(data_dictionary_sentiment) +seed_test <- as.seedwords(data_dictionary_sentiment) test_that("textmodel_lss works when spatial = TRUE", { skip_on_cran() # without data - lss1 <- textmodel_lss(toks_test, seed, k = 10) + lss1 <- textmodel_lss(toks_test, seed_test, k = 10) expect_s3_class(lss1, "textmodel_lss") expect_equal(lss1$k, 10) @@ -30,7 +30,7 @@ test_that("textmodel_lss works when spatial = TRUE", { ) # with data - lss2 <- textmodel_lss(toks_test, seed, k = 10, include_data = TRUE) + lss2 <- textmodel_lss(toks_test, seed_test, k = 10, include_data = TRUE) expect_s3_class(lss2, "textmodel_lss") expect_equal(lss2$concatenator, concatenator(toks_test)) @@ -41,7 +41,7 @@ test_that("textmodel_lss works when spatial = TRUE", { ) # with terms - lss3 <- textmodel_lss(toks_test, seed, k = 10, terms = feat_test, + lss3 <- textmodel_lss(toks_test, seed_test, k = 10, terms = feat_test, include_data = TRUE, group_data = TRUE) expect_s3_class(lss3, "textmodel_lss") @@ -52,7 +52,7 @@ test_that("textmodel_lss works when spatial = TRUE", { ) # with tokens_xptr - lss4 <- textmodel_lss(as.tokens_xptr(toks_test), seed, k = 10, + lss4 <- textmodel_lss(as.tokens_xptr(toks_test), seed_test, k = 10, include_data = TRUE) expect_s3_class(lss4, "textmodel_lss") @@ -60,7 +60,7 @@ test_that("textmodel_lss works when spatial = TRUE", { # warning expect_warning( - textmodel_lss(toks_test, seed, k = 10, + textmodel_lss(toks_test, seed_test, k = 10, include_data = FALSE, group_data = TRUE), "group_data is ignored when include_data = FALSE" ) @@ -83,7 +83,7 @@ test_that("textmodel_lss works when spatial = FALSE", { skip_on_cran() # without data - lss1 <- textmodel_lss(toks_test, seed, k = 10, spatial = FALSE) + lss1 <- textmodel_lss(toks_test, seed_test, k = 10, spatial = FALSE) expect_s3_class(lss1, "textmodel_lss") expect_equal(lss1$k, 10) @@ -99,7 +99,7 @@ test_that("textmodel_lss works when spatial = FALSE", { ) # with data - lss2 <- textmodel_lss(toks_test, seed, k = 10, include_data = TRUE, spatial = FALSE) + lss2 <- textmodel_lss(toks_test, seed_test, k = 10, include_data = TRUE, spatial = FALSE) expect_s3_class(lss2, "textmodel_lss") expect_equal(lss2$concatenator, concatenator(toks_test)) @@ -110,7 +110,7 @@ test_that("textmodel_lss works when spatial = FALSE", { ) # with terms - lss3 <- textmodel_lss(toks_test, seed, k = 10, terms = feat_test, + lss3 <- textmodel_lss(toks_test, seed_test, k = 10, terms = feat_test, include_data = TRUE, group_data = TRUE, spatial = FALSE) expect_s3_class(lss3, "textmodel_lss") @@ -121,7 +121,7 @@ test_that("textmodel_lss works when spatial = FALSE", { ) # with tokens_xptr - lss4 <- textmodel_lss(as.tokens_xptr(toks_test), seed, k = 10, + lss4 <- textmodel_lss(as.tokens_xptr(toks_test), seed_test, k = 10, include_data = TRUE, spatial = FALSE) expect_s3_class(lss4, "textmodel_lss") @@ -129,7 +129,7 @@ test_that("textmodel_lss works when spatial = FALSE", { # warning expect_warning( - textmodel_lss(toks_test, seed, k = 10, + textmodel_lss(toks_test, seed_test, k = 10, include_data = FALSE, group_data = TRUE, spatial = FALSE), "group_data is ignored when include_data = FALSE" ) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index b94e35d2..83c3ca9f 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -187,3 +187,22 @@ test_that("smooth_lss works with multiple grouping variables", { "columns for grouping cannot be numeric" ) }) + +test_that("rowMaxs works", { + + mat <- matrix(c( 0.1, 0.1, 0.9, + 0.2, 0.2, -0.1, + -0.1, -0.1, 0.0), nrow = 3, byrow = TRUE) + rownames(mat) <- c("a", "b", "c") + + expect_equal( + rowMaxs(mat), + c("a" = 0.9, "b" = 0.2, "c" = 0.0) + ) + expect_error( + rowMaxs(as.data.frame(mat)), + "is.matrix(x) is not TRUE", + fixed = TRUE + ) + +})