koheiw · koheiw · Apr 4, 2026 · Mar 13, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,12 +7,15 @@ S3method(as.statistics_textmodel,data.frame)
 S3method(as.statistics_textmodel,matrix)
 S3method(as.textmodel_lss,matrix)
 S3method(as.textmodel_lss,numeric)
+S3method(as.textmodel_lss,textmodel_doc2vec)
 S3method(as.textmodel_lss,textmodel_lss)
+S3method(as.textmodel_lss,textmodel_word2vec)
 S3method(as.textmodel_lss,textmodel_wordvector)
 S3method(coef,textmodel_lss)
 S3method(diagnosys,character)
 S3method(diagnosys,corpus)
 S3method(predict,textmodel_lss)
+S3method(predict,textmodel_lss3)
 S3method(print,coefficients_textmodel)
 S3method(print,statistics_textmodel)
 S3method(print,summary.textmodel)
@@ -35,6 +38,7 @@ export(coefficients.textmodel_lss)
 export(cohesion)
 export(diagnosys)
 export(optimize_lss)
+export(rowMaxs)
 export(seedwords)
 export(smooth_lss)
 export(textmodel_lss)

diff --git a/R/as.textmodel.R b/R/as.textmodel.R
@@ -30,16 +30,15 @@ as.textmodel_lss.matrix <- function(x, seeds,
                                     nested_weight = TRUE,
                                     verbose = FALSE, ...) {
 
-  args <- list(terms = terms, seeds = seeds)
   if (is.null(colnames(x)))
     stop("x must have column names for features")
   if (any(is.na(colnames(x))))
     stop("x must not have NA in the column names")
   if (any(is.na(x)))
     stop("x must not have NA")
 
-  seeds <- expand_seeds(seeds, colnames(x), nested_weight, verbose)
-  seed <- unlist(unname(seeds))
+  s <- expand_seeds(seeds, colnames(x), nested_weight, verbose)
+  seed <- unlist(unname(s))
   theta <- get_theta(terms, colnames(x))
 
   if (is.null(slice)) {
@@ -57,8 +56,8 @@ as.textmodel_lss.matrix <- function(x, seeds,
     beta = beta,
     k = nrow(x),
     slice = slice,
-    terms = args$terms,
-    seeds = args$seeds,
+    terms = terms,
+    seeds = seeds,
     seeds_weighted = seed,
     embedding = x,
     similarity = simil$seed,
@@ -100,66 +99,3 @@ as.textmodel_lss.textmodel_lss <- function(x, ...) {
   result$frequency <- x$frequency[names(result$beta)]
   return(result)
 }
-
-#' @rdname as.textmodel_lss
-#' @export
-#' @method as.textmodel_lss textmodel_wordvector
-as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
-                                                  terms = NULL,
-                                                  nested_weight = TRUE,
-                                                  verbose = FALSE,
-                                                  spatial = TRUE,
-                                                  ...) {
-
-  #args <- list(terms = terms, seeds = seeds)
-  spatial <- check_logical(spatial)
-  if (spatial) {
-
-    if (x$version == as.numeric_version("0.1.0")) {
-      values <- x$vector
-    } else if (x$version >= as.numeric_version("0.6.0")) {
-      values <- x$values$word
-    } else {
-      values <- x$values
-    }
-    result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms,
-                               nested_weight = nested_weight, ...)
-    result$frequency <- x$frequency[names(result$beta)]
-    result$type = "word2vec"
-    result$call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
-
-  } else {
-
-    if (!requireNamespace("wordvector"))
-      stop("wordvector package must be installed")
-    if (x$version < as.numeric_version("0.2.0"))
-      stop("wordvector package must be v0.2.0 or later")
-
-    seeds <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose)
-    seed <- unlist(unname(seeds))
-    theta <- get_theta(terms, names(x$frequency))
-
-    suppressWarnings({
-      if (packageVersion("wordvector") >= "0.6.0") {
-        prob <- wordvector::probability(x, names(seed), mode = "numeric")
-      } else {
-        prob <- wordvector::probability(x, names(seed), mode = "values")
-      }
-    })
-    beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta
-
-    result <- build_lss(
-      beta = beta,
-      beta_type = "probability",
-      k = x$dim,
-      terms = terms,
-      seeds = seeds,
-      seeds_weighted = seed,
-      frequency = x$frequency[names(beta)],
-      type = "word2vec",
-      spatial = FALSE,
-      call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
-    )
-  }
-  return(result)
-}
diff --git a/R/as.textmodel2.R b/R/as.textmodel2.R
@@ -0,0 +1,72 @@
+
+#' @rdname as.textmodel_lss
+#' @export
+#' @method as.textmodel_lss textmodel_word2vec
+as.textmodel_lss.textmodel_word2vec <- function(x, seeds,
+                                                  terms = NULL,
+                                                  nested_weight = TRUE,
+                                                  verbose = FALSE,
+                                                  spatial = TRUE,
+                                                  ...) {
+
+  #args <- list(terms = terms, seeds = seeds)
+  spatial <- check_logical(spatial)
+  if (spatial) {
+
+    if (x$version == as.numeric_version("0.1.0")) {
+      values <- x$vector
+    } else if (x$version >= as.numeric_version("0.6.0")) {
+      values <- x$values$word
+    } else {
+      values <- x$values
+    }
+    result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms,
+                               nested_weight = nested_weight, ...)
+    result$frequency <- x$frequency[names(result$beta)]
+    result$concatenator <- x$concatenator
+    result$type <- "word2vec"
+    result$call <- try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
+
+  } else {
+
+    if (!requireNamespace("wordvector"))
+      stop("wordvector package must be installed")
+    if (x$version < as.numeric_version("0.2.0"))
+      stop("wordvector package must be v0.2.0 or later")
+
+    s <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose)
+    seed <- unlist(unname(s))
+    theta <- get_theta(terms, names(x$frequency))
+
+    suppressWarnings({
+      if (packageVersion("wordvector") >= "0.6.0") {
+        prob <- wordvector::probability(x, names(seed), mode = "numeric")
+      } else {
+        prob <- wordvector::probability(x, names(seed), mode = "values")
+      }
+    })
+    beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta
+
+    result <- build_lss(
+      beta = beta,
+      beta_type = "probability",
+      k = x$dim,
+      terms = terms,
+      seeds = seeds,
+      seeds_weighted = seed,
+      frequency = x$frequency[names(beta)],
+      concatenator = x$concatenator,
+      type = "word2vec",
+      spatial = FALSE,
+      call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
+    )
+  }
+  return(result)
+}
+
+#' Support wordvector version 0.5.1 or earlier
+#' @export
+#' @method as.textmodel_lss textmodel_wordvector
+#' @keywords internal
+as.textmodel_lss.textmodel_wordvector <- as.textmodel_lss.textmodel_word2vec
+
diff --git a/R/as.textmodel3.R b/R/as.textmodel3.R
@@ -0,0 +1,76 @@
+#' @noRd
+#' @export
+#' @keywords internal
+#' @param prob_mode select how to compute polarity for documents. See details.
+#' @details
+#' If `x` is a [wordvector::textmodel_doc2vec] object, it computes
+#' polarity scores of documents based on their probabilities for seed words.
+#' If `prob_mode = "mean"`, it averages the probabilities, but it keeps used only
+#' the highest probability for one of the seed words if `"max"`.
+#'
+#' @method as.textmodel_lss textmodel_doc2vec
+as.textmodel_lss.textmodel_doc2vec <- function(x, seeds, prob_mode = c("mean", "max"),
+                                               ...) {
+
+  prob_mode <- match.arg(prob_mode)
+
+  if (!requireNamespace("wordvector"))
+    stop("wordvector package must be installed")
+  if (x$version < as.numeric_version("0.6.0"))
+    stop("wordvector package must be v0.6.0 or later")
+
+  s <- expand_seeds(seeds, names(x$frequency), nested_weight = FALSE)
+  seed <- unlist(unname(s))
+  prob <- wordvector::probability(x, names(seed), layer = "document", mode = "numeric")
+
+  if (prob_mode == "max") {
+    alpha <- rowMaxs(prob %*% diag(sign(seed)), absolute = TRUE)
+  } else {
+    alpha <- rowSums(prob %*% diag(seed))
+  }
+
+  # pseudo beta
+  if (is.null(x$data)) {
+    freq <- x$frequency
+    beta <- NULL
+  } else {
+    data <- dfm(x$data, remove_padding = TRUE)
+    data <- dfm_select(data, names(x$frequency), valuetype = "fixed",
+                       case_insensitive = TRUE)
+    freq <- colSums(data)
+    beta <- colSums(data * alpha) / freq
+  }
+
+  result <- build_lss(
+    seeds = seeds,
+    seeds_weighted = seed,
+    beta = beta,
+    beta_type = "dummy",
+    terms = names(freq),
+    frequency = freq,
+    concatenator = x$concatenator,
+    type = "doc2vec",
+    spatial = FALSE,
+    call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
+  )
+  # extra information
+  result$prob_mode <- prob_mode
+  result$alpha <- alpha
+  result$length <- x$ntoken
+
+  class(result) <- c("textmodel_lss3", class(result))
+  return(result)
+
+}
+
+#' @export
+#' @keywords internal
+#' @method predict textmodel_lss3
+predict.textmodel_lss3 <- function(object, min_n = 0L, ...) {
+  n <- object$length
+  p <- object$alpha
+  if (min_n > 0)
+    p <- p * (n / pmax(n, min_n))
+  p[n == 0] <- NA_real_
+  return(p)
+}
diff --git a/R/textmodel_lss.R b/R/textmodel_lss.R
@@ -106,8 +106,8 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
 
     k <- check_integer(k, min_len = 1, max_len = 1, min = 2, max = nrow(x))
     engine <- match.arg(engine)
-    seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
-    seed <- unlist(unname(seeds))
+    s <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
+    seed <- unlist(unname(s))
     theta <- get_theta(terms, featnames(x))
     feat <- union(names(theta), names(seed))
 
@@ -136,11 +136,11 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
         k = k,
         slice = slice,
         frequency = colSums(x)[names(beta)],
-        terms = args$terms,
-        seeds = args$seeds,
+        terms = terms,
+        seeds = seeds,
         seeds_weighted = seed,
         embedding = embed,
-        similarity = simil$seed,
+        similarity = simil$seed, # TODO: remove
         concatenator = meta(x, field = "concatenator", type = "object"),
         type = "svd",
         call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE),
@@ -268,7 +268,7 @@ expand_seeds <- function(seeds, features, nested_weight = TRUE, verbose = FALSE)
     seeds_weighted <- weight_seeds(seeds, features, nested_weight)
 
     if (all(lengths(seeds_weighted) == 0))
-        stop("No seed word is found in the dfm", call. = FALSE)
+        stop("Seed words are not found in x", call. = FALSE)
 
     return(seeds_weighted)
 }
@@ -386,7 +386,7 @@ get_seeds <- function(seeds) {
         seeds <- structure(rep(1, length(seeds)), names = seeds)
 
     if (is.null(names(seeds)))
-        stop("y must be a named-numerid vector\n", call. = FALSE)
+        stop("seeds must be a character or named-numeric vector\n", call. = FALSE)
 
     return(seeds)
 }

diff --git a/R/textplot.R b/R/textplot.R
@@ -70,6 +70,9 @@ textplot_terms.textmodel_lss <- function(x, highlighted = NULL,
     max_highlighted <- check_integer(max_highlighted, min = 0)
     sampling <- match.arg(sampling)
 
+    if (x$type == "doc2vec" && is.null(x$beta))
+      stop("x must be trained with include_data = TRUE to compute pseudo word polarity")
+
     x$frequency <- x$frequency[names(x$beta)] # fix for < v1.1.4
     x$frequency[is.na(x$frequency)] <- 0
 

diff --git a/R/utils.R b/R/utils.R
@@ -230,3 +230,16 @@ print.textmodel_lss <- function(x, ...) {
   print(x$call)
   cat("\n")
 }
+
+#' Maximum values in each row of a matrix
+#' @param x a matrix object.
+#' @param absolute if `TRUE`, return largest positive or negative values.
+#' @export
+#' @keywords internal
+rowMaxs <- function(x, absolute = FALSE) {
+  stopifnot(is.matrix(x))
+  y <- if (absolute) abs(x) else x
+  structure(x[cbind(seq_len(nrow(x)), max.col(y, "first"))],
+            names = rownames(x))
+}
+
diff --git a/man/as.textmodel_lss.Rd b/man/as.textmodel_lss.Rd
diff --git a/man/as.textmodel_lss.textmodel_wordvector.Rd b/man/as.textmodel_lss.textmodel_wordvector.Rd