Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c2004d3
Add methods for textmodel_lss2
koheiw Mar 13, 2026
240ac15
Separate lss, lss2, lss3
koheiw Mar 19, 2026
597d31a
Save doc2vec for tests
koheiw Mar 19, 2026
a0a3f09
Set nested_weight = FALSE
koheiw Mar 19, 2026
9c4ed32
Save seeds_weighted
koheiw Mar 19, 2026
0be631d
Update MAN
koheiw Mar 19, 2026
ef6fd0a
Use rowSums for weighted probability
koheiw Mar 26, 2026
b7857b9
Fix predict
koheiw Mar 26, 2026
df4d335
Weight outside of probability()
koheiw Mar 26, 2026
e230213
Add tests
koheiw Mar 26, 2026
e973462
Add tests
koheiw Mar 26, 2026
98bf705
Change seeds_weighted to a list()
koheiw Mar 26, 2026
4d0d622
Add comment
koheiw Mar 26, 2026
5a60e3f
Save concatenator
koheiw Mar 26, 2026
3cb8801
Revert to seeds_weighted to a vector
koheiw Mar 26, 2026
267b8ed
Add function for old wordvector object
koheiw Mar 26, 2026
8d081b0
Simplify
koheiw Mar 26, 2026
2bf351e
Update error message
koheiw Mar 27, 2026
9081042
Separate tests for word2vec
koheiw Mar 27, 2026
75e86bb
Check if wordvector is installed
koheiw Mar 27, 2026
9a8297d
Tidy up tests
koheiw Mar 27, 2026
789656b
Change to prob_mode
koheiw Apr 2, 2026
6fae9d3
Add tests for rowMaxs()
koheiw Apr 2, 2026
4e26c18
Update MAN
koheiw Apr 2, 2026
734c242
Update test object
koheiw Apr 2, 2026
690bac8
Check beta before plotting
koheiw Apr 2, 2026
af81a8a
Fix
koheiw Apr 2, 2026
9bc30be
Build
koheiw Apr 2, 2026
2d11e33
Fix
koheiw Apr 2, 2026
885438c
Add dots
koheiw Apr 2, 2026
004a00f
Fix the handling of negative polarity
koheiw Apr 2, 2026
63373c8
predict() return NA for empty documents
koheiw Apr 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@ S3method(as.statistics_textmodel,data.frame)
S3method(as.statistics_textmodel,matrix)
S3method(as.textmodel_lss,matrix)
S3method(as.textmodel_lss,numeric)
S3method(as.textmodel_lss,textmodel_doc2vec)
S3method(as.textmodel_lss,textmodel_lss)
S3method(as.textmodel_lss,textmodel_word2vec)
S3method(as.textmodel_lss,textmodel_wordvector)
S3method(coef,textmodel_lss)
S3method(diagnosys,character)
S3method(diagnosys,corpus)
S3method(predict,textmodel_lss)
S3method(predict,textmodel_lss3)
S3method(print,coefficients_textmodel)
S3method(print,statistics_textmodel)
S3method(print,summary.textmodel)
Expand All @@ -35,6 +38,7 @@ export(coefficients.textmodel_lss)
export(cohesion)
export(diagnosys)
export(optimize_lss)
export(rowMaxs)
export(seedwords)
export(smooth_lss)
export(textmodel_lss)
Expand Down
72 changes: 4 additions & 68 deletions R/as.textmodel.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,15 @@ as.textmodel_lss.matrix <- function(x, seeds,
nested_weight = TRUE,
verbose = FALSE, ...) {

args <- list(terms = terms, seeds = seeds)
if (is.null(colnames(x)))
stop("x must have column names for features")
if (any(is.na(colnames(x))))
stop("x must not have NA in the column names")
if (any(is.na(x)))
stop("x must not have NA")

seeds <- expand_seeds(seeds, colnames(x), nested_weight, verbose)
seed <- unlist(unname(seeds))
s <- expand_seeds(seeds, colnames(x), nested_weight, verbose)
seed <- unlist(unname(s))
theta <- get_theta(terms, colnames(x))

if (is.null(slice)) {
Expand All @@ -57,8 +56,8 @@ as.textmodel_lss.matrix <- function(x, seeds,
beta = beta,
k = nrow(x),
slice = slice,
terms = args$terms,
seeds = args$seeds,
terms = terms,
seeds = seeds,
seeds_weighted = seed,
embedding = x,
similarity = simil$seed,
Expand Down Expand Up @@ -100,66 +99,3 @@ as.textmodel_lss.textmodel_lss <- function(x, ...) {
result$frequency <- x$frequency[names(result$beta)]
return(result)
}

#' @rdname as.textmodel_lss
#' @export
#' @method as.textmodel_lss textmodel_wordvector
as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
terms = NULL,
nested_weight = TRUE,
verbose = FALSE,
spatial = TRUE,
...) {

#args <- list(terms = terms, seeds = seeds)
spatial <- check_logical(spatial)
if (spatial) {

if (x$version == as.numeric_version("0.1.0")) {
values <- x$vector
} else if (x$version >= as.numeric_version("0.6.0")) {
values <- x$values$word
} else {
values <- x$values
}
result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms,
nested_weight = nested_weight, ...)
result$frequency <- x$frequency[names(result$beta)]
result$type = "word2vec"
result$call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)

} else {

if (!requireNamespace("wordvector"))
stop("wordvector package must be installed")
if (x$version < as.numeric_version("0.2.0"))
stop("wordvector package must be v0.2.0 or later")

seeds <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose)
seed <- unlist(unname(seeds))
theta <- get_theta(terms, names(x$frequency))

suppressWarnings({
if (packageVersion("wordvector") >= "0.6.0") {
prob <- wordvector::probability(x, names(seed), mode = "numeric")
} else {
prob <- wordvector::probability(x, names(seed), mode = "values")
}
})
beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta

result <- build_lss(
beta = beta,
beta_type = "probability",
k = x$dim,
terms = terms,
seeds = seeds,
seeds_weighted = seed,
frequency = x$frequency[names(beta)],
type = "word2vec",
spatial = FALSE,
call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
)
}
return(result)
}
72 changes: 72 additions & 0 deletions R/as.textmodel2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@

#' @rdname as.textmodel_lss
#' @export
#' @method as.textmodel_lss textmodel_word2vec
as.textmodel_lss.textmodel_word2vec <- function(x, seeds,
terms = NULL,
nested_weight = TRUE,
verbose = FALSE,
spatial = TRUE,
...) {

#args <- list(terms = terms, seeds = seeds)
spatial <- check_logical(spatial)
if (spatial) {

if (x$version == as.numeric_version("0.1.0")) {
values <- x$vector
} else if (x$version >= as.numeric_version("0.6.0")) {
values <- x$values$word
} else {
values <- x$values
}
result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms,
nested_weight = nested_weight, ...)
result$frequency <- x$frequency[names(result$beta)]
result$concatenator <- x$concatenator
result$type <- "word2vec"
result$call <- try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)

} else {

if (!requireNamespace("wordvector"))
stop("wordvector package must be installed")
if (x$version < as.numeric_version("0.2.0"))
stop("wordvector package must be v0.2.0 or later")

s <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose)
seed <- unlist(unname(s))
theta <- get_theta(terms, names(x$frequency))

suppressWarnings({
if (packageVersion("wordvector") >= "0.6.0") {
prob <- wordvector::probability(x, names(seed), mode = "numeric")
} else {
prob <- wordvector::probability(x, names(seed), mode = "values")
}
})
beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta

result <- build_lss(
beta = beta,
beta_type = "probability",
k = x$dim,
terms = terms,
seeds = seeds,
seeds_weighted = seed,
frequency = x$frequency[names(beta)],
concatenator = x$concatenator,
type = "word2vec",
spatial = FALSE,
call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
)
}
return(result)
}

#' Support wordvector version 0.5.1 or earlier
#' @export
#' @method as.textmodel_lss textmodel_wordvector
#' @keywords internal
as.textmodel_lss.textmodel_wordvector <- as.textmodel_lss.textmodel_word2vec

76 changes: 76 additions & 0 deletions R/as.textmodel3.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' @noRd
#' @export
#' @keywords internal
#' @param prob_mode select how to compute polarity for documents. See details.
#' @details
#' If `x` is a [wordvector::textmodel_doc2vec] object, it computes
#' polarity scores of documents based on their probabilities for seed words.
#' If `prob_mode = "mean"`, it averages the probabilities, but it keeps used only
#' the highest probability for one of the seed words if `"max"`.
#'
#' @method as.textmodel_lss textmodel_doc2vec
as.textmodel_lss.textmodel_doc2vec <- function(x, seeds, prob_mode = c("mean", "max"),
...) {

prob_mode <- match.arg(prob_mode)

if (!requireNamespace("wordvector"))
stop("wordvector package must be installed")
if (x$version < as.numeric_version("0.6.0"))
stop("wordvector package must be v0.6.0 or later")

s <- expand_seeds(seeds, names(x$frequency), nested_weight = FALSE)
seed <- unlist(unname(s))
prob <- wordvector::probability(x, names(seed), layer = "document", mode = "numeric")

if (prob_mode == "max") {
alpha <- rowMaxs(prob %*% diag(sign(seed)), absolute = TRUE)
} else {
alpha <- rowSums(prob %*% diag(seed))
}

# pseudo beta
if (is.null(x$data)) {
freq <- x$frequency
beta <- NULL
} else {
data <- dfm(x$data, remove_padding = TRUE)
data <- dfm_select(data, names(x$frequency), valuetype = "fixed",
case_insensitive = TRUE)
freq <- colSums(data)
beta <- colSums(data * alpha) / freq
}

result <- build_lss(
seeds = seeds,
seeds_weighted = seed,
beta = beta,
beta_type = "dummy",
terms = names(freq),
frequency = freq,
concatenator = x$concatenator,
type = "doc2vec",
spatial = FALSE,
call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
)
# extra information
result$prob_mode <- prob_mode
result$alpha <- alpha
result$length <- x$ntoken

class(result) <- c("textmodel_lss3", class(result))
return(result)

}

#' @export
#' @keywords internal
#' @method predict textmodel_lss3
predict.textmodel_lss3 <- function(object, min_n = 0L, ...) {
n <- object$length
p <- object$alpha
if (min_n > 0)
p <- p * (n / pmax(n, min_n))
p[n == 0] <- NA_real_
return(p)
}
14 changes: 7 additions & 7 deletions R/textmodel_lss.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,

k <- check_integer(k, min_len = 1, max_len = 1, min = 2, max = nrow(x))
engine <- match.arg(engine)
seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
seed <- unlist(unname(seeds))
s <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
seed <- unlist(unname(s))
theta <- get_theta(terms, featnames(x))
feat <- union(names(theta), names(seed))

Expand Down Expand Up @@ -136,11 +136,11 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
k = k,
slice = slice,
frequency = colSums(x)[names(beta)],
terms = args$terms,
seeds = args$seeds,
terms = terms,
seeds = seeds,
seeds_weighted = seed,
embedding = embed,
similarity = simil$seed,
similarity = simil$seed, # TODO: remove
concatenator = meta(x, field = "concatenator", type = "object"),
type = "svd",
call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE),
Expand Down Expand Up @@ -268,7 +268,7 @@ expand_seeds <- function(seeds, features, nested_weight = TRUE, verbose = FALSE)
seeds_weighted <- weight_seeds(seeds, features, nested_weight)

if (all(lengths(seeds_weighted) == 0))
stop("No seed word is found in the dfm", call. = FALSE)
stop("Seed words are not found in x", call. = FALSE)

return(seeds_weighted)
}
Expand Down Expand Up @@ -386,7 +386,7 @@ get_seeds <- function(seeds) {
seeds <- structure(rep(1, length(seeds)), names = seeds)

if (is.null(names(seeds)))
stop("y must be a named-numerid vector\n", call. = FALSE)
stop("seeds must be a character or named-numeric vector\n", call. = FALSE)

return(seeds)
}
Expand Down
3 changes: 3 additions & 0 deletions R/textplot.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ textplot_terms.textmodel_lss <- function(x, highlighted = NULL,
max_highlighted <- check_integer(max_highlighted, min = 0)
sampling <- match.arg(sampling)

if (x$type == "doc2vec" && is.null(x$beta))
stop("x must be trained with include_data = TRUE to compute pseudo word polarity")

x$frequency <- x$frequency[names(x$beta)] # fix for < v1.1.4
x$frequency[is.na(x$frequency)] <- 0

Expand Down
13 changes: 13 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,16 @@ print.textmodel_lss <- function(x, ...) {
print(x$call)
cat("\n")
}

#' Maximum values in each row of a matrix
#' @param x a matrix object.
#' @param absolute if `TRUE`, return largest positive or negative values.
#' @export
#' @keywords internal
rowMaxs <- function(x, absolute = FALSE) {
stopifnot(is.matrix(x))
y <- if (absolute) abs(x) else x
structure(x[cbind(seq_len(nrow(x)), max.col(y, "first"))],
names = rownames(x))
}

6 changes: 3 additions & 3 deletions man/as.textmodel_lss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions man/as.textmodel_lss.textmodel_wordvector.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading