From c75ce35ace303dd1a8df507a44cfda3e70f3fb4d Mon Sep 17 00:00:00 2001 From: Katarzyna Wreczycka Date: Wed, 13 Jun 2018 16:51:58 +0200 Subject: [PATCH 01/13] added estimate.params.density param to methSeg --- R/methSeg.R | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/R/methSeg.R b/R/methSeg.R index dfbc240c..b2d411f5 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -23,6 +23,13 @@ #' @param join.neighbours if TRUE neighbouring segments that cluster to the same #' seg.group are joined by extending the ranges, summing up num.marks and #' averaging over seg.means. +#' @param estimate.params.density a numeric value indicating percentage of regions +#' to sample and then estimate initial parameters (G and modelName) for a +#' function to calculate density (\code{\link[mclust]{densityMclust}}). +#' The value can be between 0 and 1, e.g. 0.1 means that 10% of data will +#' be used for sampling, otherwise it uses the whole dataset (Default: 1) +#' and it's time consuming on large datasets. If 0 or 1 then the function +#' will be executed without sampling. #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg #' package, or to \code{\link[mclust]{densityMclust}} #' in Mclust package, could be used to fine tune the segmentation algorithm. @@ -62,14 +69,15 @@ #' #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE)) #' -#' @author Altuna Akalin, contributions by Arsene Wabo +#' @author Altuna Akalin, contributions by Arsene Waboand Katarzyna Wreczycka #' #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} #' #' @export #' @docType methods #' @rdname methSeg -methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, ...){ +methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, + estimate.params.density=1, ...){ dots <- list(...) @@ -133,10 +141,25 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, ...){ diagnostic.plot = FALSE } - # decide on number of components/groups - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - dens=do.call("densityFind", args.Mclust ) + if(estimate.params.density>0 & estimate.params.density<1 ){ + + nbr.sample = floor(length(seg.res) * estimate.params.density) + # estimate parameters for mclust + args.Mclust.esti = args.Mclust + args.Mclust.esti[["score.gr"]]=seg.res[ sample(1:length(seg.res), nbr.sample) ] + args.Mclust.esti[["diagnostic.plot"]]=FALSE + dens_estimate=do.call("densityFind", args.Mclust.esti) + + args.Mclust[["modelName"]] = dens_estimate$modelName + args.Mclust[["G"]] = 1:dens_estimate$G + } + + if(!join.neighbours) { + # decide on number of components/groups + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + dens=do.call("densityFind", args.Mclust ) + } # add components/group ids mcols(seg.res)$seg.group=as.character(dens$classification) From 5378e6162695559f5ce2daecffd56b6576e0cbce Mon Sep 17 00:00:00 2001 From: Kasia Wreczycka Date: Thu, 14 Jun 2018 12:48:21 +0200 Subject: [PATCH 02/13] remove if clause --- R/methSeg.R | 64 +++++++++++++++++++++++++------------------------- man/methSeg.Rd | 11 ++++++++- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/R/methSeg.R b/R/methSeg.R index b2d411f5..1a05daaf 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -52,7 +52,7 @@ #' @examples #' #' \donttest{ -#' download.file("https://github.com/BIMSBbioinfo/compgen2018/raw/master/day3_diffMeth/data/H1.chr21.chr22.rds", +#' download.file("https://dl.dropboxusercontent.com/u/1373164/H1.chr21.chr22.rds", #' destfile="H1.chr21.chr22.rds",method="curl") #' #' mbw=readRDS("H1.chr21.chr22.rds") @@ -69,7 +69,7 @@ #' #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE)) #' -#' @author Altuna Akalin, contributions by Arsene Waboand Katarzyna Wreczycka +#' @author Altuna Akalin, contributions by Arsene Wabo #' #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} #' @@ -84,15 +84,15 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, ##coerce object to granges if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") { - obj= as(obj,"GRanges") - ## calculate methylation score - mcols(obj)$meth=100*obj$numCs/obj$coverage - ## select only required mcol - obj = obj[,"meth"] + obj= as(obj,"GRanges") + ## calculate methylation score + mcols(obj)$meth=100*obj$numCs/obj$coverage + ## select only required mcol + obj = obj[,"meth"] }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") { - obj = as(obj,"GRanges") - ## use methylation difference as score - obj = obj[,"meth.diff"] + obj = as(obj,"GRanges") + ## use methylation difference as score + obj = obj[,"meth.diff"] }else if (class(obj) != "GRanges"){ stop("only methylRaw or methylDiff objects ", "or GRanges objects can be used in this function") @@ -149,34 +149,34 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, args.Mclust.esti[["score.gr"]]=seg.res[ sample(1:length(seg.res), nbr.sample) ] args.Mclust.esti[["diagnostic.plot"]]=FALSE dens_estimate=do.call("densityFind", args.Mclust.esti) - + args.Mclust[["modelName"]] = dens_estimate$modelName args.Mclust[["G"]] = 1:dens_estimate$G - } + } + + + # decide on number of components/groups + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + dens=do.call("densityFind", args.Mclust ) - if(!join.neighbours) { - # decide on number of components/groups - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - dens=do.call("densityFind", args.Mclust ) - } # add components/group ids mcols(seg.res)$seg.group=as.character(dens$classification) # if joining, show clustering after joining if(join.neighbours) { - message("joining neighbouring segments and repeating clustering.") - seg.res <- joinSegmentNeighbours(seg.res) - diagnostic.plot <- diagnostic.plot.old - - # get the new density - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - # skip second progress bar - args.Mclust[["verbose"]]=FALSE - dens=do.call("densityFind", args.Mclust ) - + message("joining neighbouring segments and repeating clustering.") + seg.res <- joinSegmentNeighbours(seg.res) + diagnostic.plot <- diagnostic.plot.old + + # get the new density + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + # skip second progress bar + args.Mclust[["verbose"]]=FALSE + dens=do.call("densityFind", args.Mclust ) + } seg.res @@ -255,9 +255,9 @@ diagPlot<-function(dens,score.gr){ #' @docType methods #' @rdname methSeg2bed methSeg2bed<-function(segments,filename, -trackLine="track name='meth segments' description='meth segments' itemRgb=On", -colramp=colorRamp(c("gray","green", "darkgreen")) - ){ + trackLine="track name='meth segments' description='meth segments' itemRgb=On", + colramp=colorRamp(c("gray","green", "darkgreen")) +){ if(class(segments)!="GRanges"){ stop("segments object has to be of class GRanges") } diff --git a/man/methSeg.Rd b/man/methSeg.Rd index 935e9ff4..6e764441 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -5,7 +5,8 @@ \alias{methSeg} \title{Segment methylation or differential methylation profile} \usage{ -methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, ...) +methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, + estimate.params.density = 1, ...) } \arguments{ \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, @@ -25,6 +26,14 @@ in mixture modeling.} seg.group are joined by extending the ranges, summing up num.marks and averaging over seg.means.} +\item{estimate.params.density}{a numeric value indicating percentage of regions +to sample and then estimate initial parameters (G and modelName) for a +function to calculate density (\code{\link[mclust]{densityMclust}}). +The value can be between 0 and 1, e.g. 0.1 means that 10% of data will +be used for sampling, otherwise it uses the whole dataset (Default: 1) +and it's time consuming on large datasets. If 0 or 1 then the function +will be executed without sampling.} + \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg package, or to \code{\link[mclust]{densityMclust}} in Mclust package, could be used to fine tune the segmentation algorithm. From eb85d019069df15d0d5433c8ac5f7af718c6d407 Mon Sep 17 00:00:00 2001 From: Kasia Wreczycka Date: Thu, 14 Jun 2018 13:17:50 +0200 Subject: [PATCH 03/13] fix previous commit --- R/methSeg.R | 3 ++- man/methSeg.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/methSeg.R b/R/methSeg.R index 1a05daaf..f151c558 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -69,7 +69,8 @@ #' #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE)) #' -#' @author Altuna Akalin, contributions by Arsene Wabo +#' @author Altuna Akalin, contributions by Arsene Wabo and Katarzyna +#' Wreczycka #' #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} #' diff --git a/man/methSeg.Rd b/man/methSeg.Rd index 6e764441..5567ff73 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -87,5 +87,6 @@ unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE)) \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} } \author{ -Altuna Akalin, contributions by Arsene Wabo +Altuna Akalin, contributions by Arsene Wabo and Katarzyna +Wreczycka } From a51c93ebc2fd1bf7d2474e0721c1ac9365c56bf0 Mon Sep 17 00:00:00 2001 From: alexg9010 Date: Thu, 14 Jun 2018 18:06:17 +0200 Subject: [PATCH 04/13] Updates to @katwre pull request: - changed parameter name from `estimate.params.density` to `initialize.on.subset` - allow for values higher than 1, which directly yields number of samples - priorize Mclust argument `initialization` - check for to few samples, 9 seems to be the magic number - add tests --- R/methSeg.R | 50 ++++++++++++++++++++++----------- man/methSeg.Rd | 16 +++++------ tests/testthat/test-8-methSeg.r | 6 ++++ 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/R/methSeg.R b/R/methSeg.R index f151c558..94f0d128 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -23,13 +23,13 @@ #' @param join.neighbours if TRUE neighbouring segments that cluster to the same #' seg.group are joined by extending the ranges, summing up num.marks and #' averaging over seg.means. -#' @param estimate.params.density a numeric value indicating percentage of regions -#' to sample and then estimate initial parameters (G and modelName) for a -#' function to calculate density (\code{\link[mclust]{densityMclust}}). -#' The value can be between 0 and 1, e.g. 0.1 means that 10% of data will -#' be used for sampling, otherwise it uses the whole dataset (Default: 1) -#' and it's time consuming on large datasets. If 0 or 1 then the function -#' will be executed without sampling. +#' @param initialize.on.subset a numeric value indicating either percentage or +#' absolute value of regions to subsample from segments before performing +#' the mixture modeling. The value can be either between 0 and 1, +#' e.g. 0.1 means that 10% of data will be used for sampling, or be an +#' integer higher than 1 to define the number of regions to sample. +#' By default uses the whole dataset, which can be time consuming on +#' large datasets. (Default: 1) #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg #' package, or to \code{\link[mclust]{densityMclust}} #' in Mclust package, could be used to fine tune the segmentation algorithm. @@ -78,7 +78,7 @@ #' @docType methods #' @rdname methSeg methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, - estimate.params.density=1, ...){ + initialize.on.subset=1, ...){ dots <- list(...) @@ -142,17 +142,33 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, diagnostic.plot = FALSE } - if(estimate.params.density>0 & estimate.params.density<1 ){ + if("initialization" %in% names(args.Mclust)){ + if("subset" %in% names(args.Mclust[["initialization"]])) { + if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){ + stop("too few samples, increase the size of subset.") + } + message(paste("initializing clustering with", + length(args.Mclust[["initialization"]][["subset"]]), + "segments.")) + initialize.on.subset = 1 + } + } + + if(initialize.on.subset != 1 && initialize.on.subset > 0 ) { - nbr.sample = floor(length(seg.res) * estimate.params.density) - # estimate parameters for mclust - args.Mclust.esti = args.Mclust - args.Mclust.esti[["score.gr"]]=seg.res[ sample(1:length(seg.res), nbr.sample) ] - args.Mclust.esti[["diagnostic.plot"]]=FALSE - dens_estimate=do.call("densityFind", args.Mclust.esti) + if( initialize.on.subset > 0 & initialize.on.subset < 1 ) + nbr.sample = floor(length(seg.res) * initialize.on.subset) - args.Mclust[["modelName"]] = dens_estimate$modelName - args.Mclust[["G"]] = 1:dens_estimate$G + if( initialize.on.subset > 1) + nbr.sample = initialize.on.subset + + if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") } + + message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments.")) + # estimate parameters for mclust + sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE) + if(length(sub) < 9 ){stop("too few samples, increase the size of subset.") } + args.Mclust[["initialization"]]=list(subset = sub) } diff --git a/man/methSeg.Rd b/man/methSeg.Rd index 5567ff73..6f1a2765 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -6,7 +6,7 @@ \title{Segment methylation or differential methylation profile} \usage{ methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, - estimate.params.density = 1, ...) + initialize.on.subset = 1, ...) } \arguments{ \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, @@ -26,13 +26,13 @@ in mixture modeling.} seg.group are joined by extending the ranges, summing up num.marks and averaging over seg.means.} -\item{estimate.params.density}{a numeric value indicating percentage of regions -to sample and then estimate initial parameters (G and modelName) for a -function to calculate density (\code{\link[mclust]{densityMclust}}). -The value can be between 0 and 1, e.g. 0.1 means that 10% of data will -be used for sampling, otherwise it uses the whole dataset (Default: 1) -and it's time consuming on large datasets. If 0 or 1 then the function -will be executed without sampling.} +\item{initialize.on.subset}{a numeric value indicating either percentage or +absolute value of regions to subsample from segments before performing +the mixture modeling. The value can be either between 0 and 1, +e.g. 0.1 means that 10% of data will be used for sampling, or be an +integer higher than 1 to define the number of regions to sample. +By default uses the whole dataset, which can be time consuming on +large datasets. (Default: 1)} \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg package, or to \code{\link[mclust]{densityMclust}} diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r index 72a8edb5..36705501 100644 --- a/tests/testthat/test-8-methSeg.r +++ b/tests/testthat/test-8-methSeg.r @@ -55,6 +55,12 @@ test_that("check if joining neighbours works" ,{ expect_true(all(rle(res2$seg.group)$lengths == 1)) }) +test_that("check if initialization works" ,{ + expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE),"GRanges") + expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialization=list(subset = sample(1:13,9,replace = FALSE))),"GRanges") + expect_error(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialize.on.subset = 0.6)) + expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialize.on.subset = 10),"GRanges") +}) seg <- methSeg(tileRaw,diagnostic.plot = FALSE) methSeg2bed(segments = seg, filename = "test.bed") From d3970a8ea75230fd645e72c4c8fc0087d0f4cfa7 Mon Sep 17 00:00:00 2001 From: Alexander Gosdschan Date: Sat, 16 Jun 2018 11:29:16 +0200 Subject: [PATCH 05/13] Update methSeg.R Remove duplicate check for sample size. --- R/methSeg.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/methSeg.R b/R/methSeg.R index 94f0d128..0bdb42eb 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -167,7 +167,6 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments.")) # estimate parameters for mclust sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE) - if(length(sub) < 9 ){stop("too few samples, increase the size of subset.") } args.Mclust[["initialization"]]=list(subset = sub) } From b5df9d04a435c7f5167a7f14a11da4c2181d693d Mon Sep 17 00:00:00 2001 From: alexg9010 Date: Mon, 18 Jun 2018 10:35:15 +0200 Subject: [PATCH 06/13] update documentation --- R/methSeg.R | 10 ++++++++++ man/methSeg.Rd | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/R/methSeg.R b/R/methSeg.R index 0bdb42eb..7dae17ff 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -49,6 +49,16 @@ #' @details #' To be sure that the algorithm will work on your data, #' the object should have at least 5000 records +#' +#' @details After initial segmentation with fastseg(), segments are clustered +#' into self-similar groups based on their mean methylation profile +#' using mixture modeling. Mixture modeling estimates the initial +#' parameters of the distributions by using the whole dataset. +#' If "initialize.on.subset" argument set as described, the function +#' will use a subset of the data to initialize the parameters of the +#' mixture modeling prior to the Expectation-maximization (EM) algorithm +#' used by \code{\link{Mclust}} package. +#' #' @examples #' #' \donttest{ diff --git a/man/methSeg.Rd b/man/methSeg.Rd index 6f1a2765..83b615a9 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -61,6 +61,15 @@ as high or low methylated regions. \details{ To be sure that the algorithm will work on your data, the object should have at least 5000 records + +After initial segmentation with fastseg(), segments are clustered + into self-similar groups based on their mean methylation profile + using mixture modeling. Mixture modeling estimates the initial + parameters of the distributions by using the whole dataset. + If "initialize.on.subset" argument set as described, the function + will use a subset of the data to initialize the parameters of the + mixture modeling prior to the Expectation-maximization (EM) algorithm + used by \code{\link{Mclust}} package. } \examples{ From 75b949f1914bf0a7d9a0dc6cdf085e2e5079b945 Mon Sep 17 00:00:00 2001 From: alexg9010 Date: Mon, 18 Jun 2018 10:54:16 +0200 Subject: [PATCH 07/13] update NEWS and version bump --- DESCRIPTION | 5 +++-- NEWS | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e754ae73..593a9da4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,14 +2,15 @@ Package: methylKit Type: Package Title: DNA methylation analysis from high-throughput bisulfite sequencing results -Version: 1.7.4 +Version: 1.7.5 Authors@R: c(person("Altuna", "Akalin", role = c("aut", "cre"), email = "aakalin@gmail.com"), person("Matthias","Kormaksson", role = "aut"), person("Sheng","Li", role = "aut"), person("Arsene", "Wabo", role = "ctb"), person("Adrian", "Bierling", role= "aut"), - person("Alexander","Gosdschan", role="aut")) + person("Alexander","Gosdschan", role="aut"), + person("Katarzyna","Wreczycka", role="ctb")) Author: Altuna Akalin [aut, cre], Matthias Kormaksson [aut], Sheng Li [aut], Arsene Wabo [ctb], Adrian Bierling [aut], Alexander Gosdschan [aut] diff --git a/NEWS b/NEWS index 079f646f..f363e11c 100644 --- a/NEWS +++ b/NEWS @@ -1,8 +1,16 @@ +methylKit 1.7.5 +-------------- +NEW FUNCTIONS AND FEATURES +* methSeg(): introduce parameter `initialize.on.subset` to subset data + for initialization of mixture modeling; update description; add tests + + methylKit 1.7.4 -------------- IMPROVEMENTS AND BUG FIXES * update link to test-file for methSeg() function + methylKit 1.7.3 -------------- IMPROVEMENTS AND BUG FIXES From 4395aafb4e4632d86a78e7f4078d41d707008b6f Mon Sep 17 00:00:00 2001 From: Kasia Wreczycka Date: Thu, 28 Jun 2018 18:14:04 +0200 Subject: [PATCH 08/13] Added cores to methSeg --- DESCRIPTION | 2 +- NEWS | 7 + R/methSeg.R | 316 +++++++++++++++++++++----------- R/tabix.functions.R | 24 ++- tests/testthat/test-8-methSeg.r | 21 +++ 5 files changed, 255 insertions(+), 115 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 593a9da4..02642479 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: methylKit Type: Package Title: DNA methylation analysis from high-throughput bisulfite sequencing results -Version: 1.7.5 +Version: 1.7.6 Authors@R: c(person("Altuna", "Akalin", role = c("aut", "cre"), email = "aakalin@gmail.com"), person("Matthias","Kormaksson", role = "aut"), diff --git a/NEWS b/NEWS index f363e11c..b0aecf4b 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +methylKit 1.7.6 +-------------- +NEW FUNCTIONS AND FEATURES +* methSeg(): introduce parameter `cores` to run fastseg in parallel + per chromosome; update description; add tests + + methylKit 1.7.5 -------------- NEW FUNCTIONS AND FEATURES diff --git a/R/methSeg.R b/R/methSeg.R index 7dae17ff..99702204 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -1,5 +1,110 @@ # segmentation functions +.run.fastseg = function(obj, ...){ + + dots <- list(...) + + ## check wether obj contains at least one metacol + if(ncol(elementMetadata(obj))<1) + stop("GRanges does not have any meta column.") + + ## check wether obj contains is sorted by position + if(is.unsorted(obj,ignore.strand=TRUE)) { + obj <- sort(obj,ignore.strand=TRUE) + message("Object not sorted by position, sorting now.") + } + + ## check wether obj contains at least two ranges else stop + if(length(obj)<=1) + stop("segmentation requires at least two ranges.") + + # match argument names to fastseg arguments + args.fastseg=dots[names(dots) %in% names(formals(fastseg)[-1] ) ] + + args.fastseg[["x"]]=obj + + # do the segmentation + #seg.res=fastseg(obj) + seg.res <- do.call("fastseg", args.fastseg) + + # stop if segmentation produced only one range + if(length(seg.res)==1) { + warning("segmentation produced only one range, no mixture modeling possible.") + seg.res$seg.group <- "1" + return(seg.res) + } + return(seg.res) +} + + +.run.mclust = function(seg.res, diagnostic.plot=TRUE, join.neighbours=FALSE, + initialize.on.subset=1, ...){ + + dots <- list(...) + + # match argument names to Mclust + args.Mclust=dots[names(dots) %in% names(formals(Mclust)[-1]) ] + + # if joining, do not show first clustering + if(join.neighbours) { + diagnostic.plot.old = diagnostic.plot + diagnostic.plot = FALSE + } + + if("initialization" %in% names(args.Mclust)){ + if("subset" %in% names(args.Mclust[["initialization"]])) { + if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){ + stop("too few samples, increase the size of subset.") + } + message(paste("initializing clustering with", + length(args.Mclust[["initialization"]][["subset"]]), + "segments.")) + initialize.on.subset = 1 + } + } + + if(initialize.on.subset != 1 && initialize.on.subset > 0 ) { + + if( initialize.on.subset > 0 & initialize.on.subset < 1 ) + nbr.sample = floor(length(seg.res) * initialize.on.subset) + + if( initialize.on.subset > 1) + nbr.sample = initialize.on.subset + + if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") } + + message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments.")) + # estimate parameters for mclust + sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE) + args.Mclust[["initialization"]]=list(subset = sub) + } + + # decide on number of components/groups + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + dens=do.call("densityFind", args.Mclust ) + + # add components/group ids + mcols(seg.res)$seg.group=as.character(dens$classification) + + # if joining, show clustering after joining + if(join.neighbours) { + message("joining neighbouring segments and repeating clustering.") + seg.res <- joinSegmentNeighbours(seg.res) + diagnostic.plot <- diagnostic.plot.old + + # get the new density + args.Mclust[["score.gr"]]=seg.res + args.Mclust[["diagnostic.plot"]]=diagnostic.plot + # skip second progress bar + args.Mclust[["verbose"]]=FALSE + dens=do.call("densityFind", args.Mclust ) + + } + return(seg.res) +} + + #' Segment methylation or differential methylation profile #' #' The function uses a segmentation algorithm (\code{\link[fastseg]{fastseg}}) @@ -30,6 +135,8 @@ #' integer higher than 1 to define the number of regions to sample. #' By default uses the whole dataset, which can be time consuming on #' large datasets. (Default: 1) +#' @param cores number of cores. The function is parallelized per chromosome +#' (default: 1). #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg #' package, or to \code{\link[mclust]{densityMclust}} #' in Mclust package, could be used to fine tune the segmentation algorithm. @@ -88,126 +195,119 @@ #' @docType methods #' @rdname methSeg methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, - initialize.on.subset=1, ...){ - - dots <- list(...) - - - ##coerce object to granges - if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") { - obj= as(obj,"GRanges") - ## calculate methylation score - mcols(obj)$meth=100*obj$numCs/obj$coverage - ## select only required mcol - obj = obj[,"meth"] - }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") { - obj = as(obj,"GRanges") - ## use methylation difference as score - obj = obj[,"meth.diff"] - }else if (class(obj) != "GRanges"){ - stop("only methylRaw or methylDiff objects ", - "or GRanges objects can be used in this function") - } - - # destrand - strand(obj) <- "*" - - ## check wether obj contains at least one metacol - if(ncol(elementMetadata(obj))<1) - stop("GRanges does not have any meta column.") - - ## check wether obj contains is sorted by position - if(is.unsorted(obj,ignore.strand=TRUE)) { - obj <- sort(obj,ignore.strand=TRUE) - message("Object not sorted by position, sorting now.") - } - - ## check wether obj contains at least two ranges else stop - if(length(obj)<=1) - stop("segmentation requires at least two ranges.") - - # match argument names to fastseg arguments - args.fastseg=dots[names(dots) %in% names(formals(fastseg)[-1] ) ] + initialize.on.subset=1, + cores=1, ...){ - # match argument names to Mclust - args.Mclust=dots[names(dots) %in% names(formals(Mclust)[-1]) ] + # 1. Run segmentation - args.fastseg[["x"]]=obj - - # do the segmentation - #seg.res=fastseg(obj) - seg.res <- do.call("fastseg", args.fastseg) - #seg.res <- do.call("fastseg2", args.fastseg) - - # stop if segmentation produced only one range - if(length(seg.res)==1) { - warning("segmentation produced only one range, no mixture modeling possible.") - seg.res$seg.group <- "1" - return(seg.res) - } - - # if joining, do not show first clustering - if(join.neighbours) { - diagnostic.plot.old = diagnostic.plot - diagnostic.plot = FALSE + if(cores==1){ + + ##coerce object to granges + if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") { + obj= as(obj,"GRanges") + ## calculate methylation score + mcols(obj)$meth=100*obj$numCs/obj$coverage + ## select only required mcol + obj = obj[,"meth"] + }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") { + obj = as(obj,"GRanges") + ## use methylation difference as score + obj = obj[,"meth.diff"] + }else if (class(obj) != "GRanges"){ + stop("only methylRaw or methylDiff objects ", + "or GRanges objects can be used in this function") + } + + # destrand + strand(obj) <- "*" + + # Run segmentation + seg.res = .run.fastseg(obj) } - if("initialization" %in% names(args.Mclust)){ - if("subset" %in% names(args.Mclust[["initialization"]])) { - if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){ - stop("too few samples, increase the size of subset.") + if(cores>1){ + + ## Non-tabix files + if (class(obj)=="methylDiff" | class(obj)=="methylRaw" | + class(obj)=="GRanges"){ + + ##coerce object to granges + if(class(obj)=="methylRaw") { + obj= as(obj,"GRanges") + ## calculate methylation score + mcols(obj)$meth=100*obj$numCs/obj$coverage + ## select only required mcol + obj = obj[,"meth"] + } else if(class(obj)=="methylDiff") { + obj = as(obj,"GRanges") + ## use methylation difference as score + obj = obj[,"meth.diff"] + }else if (class(obj) != "GRanges"){ + stop("only methylRaw or methylDiff objects ", + "or GRanges objects can be used in this function") + } + # destrand + strand(obj) <- "*" + + chrs = seqlevels(obj) + # Split the input object based on chromosome + # run methSeg on chromosomes (Could be in parallel or not) + seg.res.list <- mclapply(chrs, function(chr){ + seg.res = .run.fastseg(obj[seqnames(obj)==chr]) + }, mc.cores=cores) + + # Merge resulting segments again to whole genome. + # suppressWarnings -> combined objs dont have chrs in common + seg.res = suppressWarnings( do.call("c", seg.res.list) ) + + ## Tabix files + } else if(class(obj)=="methylDiffDB" | class(obj)=="methylRawDB"){ + + .run.fastseg.tabix = function(gr0, ...){ + + # adjust colnames of input GRanges to + # methylKit naming convention + df2getcolnames = as.data.frame(gr0[1]) + df2getcolnames$width = NULL + methylKit:::.setMethylDBNames(df2getcolnames) + names(mcols(gr0)) = colnames(df2getcolnames)[5:ncol(df2getcolnames)] + + if("coverage" %in% names(mcols(gr0))){ + gr0$meth = 100*gr0$numCs/gr0$coverage + gr0 = gr0[,"meth"] + }else if("meth.diff" %in% names(mcols(gr0))){ + gr0 = gr0[,"meth.diff"] + }else if (class(obj) != "GRanges"){ + stop("only methylRaw or methylDiff objects ", + "or GRanges objects can be used in this function") } - message(paste("initializing clustering with", - length(args.Mclust[["initialization"]][["subset"]]), - "segments.")) - initialize.on.subset = 1 + # destrand + strand(gr0) <- "*" + + .run.fastseg(gr0, ...) + } + + seg.res <- applyTbxByChr(obj@dbpath, + return.type = "GRanges", + FUN = .run.fastseg.tabix, + ..., + mc.cores = cores) } } - if(initialize.on.subset != 1 && initialize.on.subset > 0 ) { - - if( initialize.on.subset > 0 & initialize.on.subset < 1 ) - nbr.sample = floor(length(seg.res) * initialize.on.subset) - - if( initialize.on.subset > 1) - nbr.sample = initialize.on.subset - - if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") } - - message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments.")) - # estimate parameters for mclust - sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE) - args.Mclust[["initialization"]]=list(subset = sub) - } - + # 2. Density Estimation via Model-Based Clustering - # decide on number of components/groups - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - dens=do.call("densityFind", args.Mclust ) - - - # add components/group ids - mcols(seg.res)$seg.group=as.character(dens$classification) - - # if joining, show clustering after joining - if(join.neighbours) { - message("joining neighbouring segments and repeating clustering.") - seg.res <- joinSegmentNeighbours(seg.res) - diagnostic.plot <- diagnostic.plot.old - - # get the new density - args.Mclust[["score.gr"]]=seg.res - args.Mclust[["diagnostic.plot"]]=diagnostic.plot - # skip second progress bar - args.Mclust[["verbose"]]=FALSE - dens=do.call("densityFind", args.Mclust ) - + if( length(seg.res) > 1 ){ + seg.res = .run.mclust(seg.res, + diagnostic.plot=diagnostic.plot, + join.neighbours=join.neighbours, + initialize.on.subset=initialize.on.subset, + cores=cores, ...) } - - seg.res + return(seg.res) } + # not needed .methSeg<-function(score.gr,score.cols=NULL,...){ #require(fastseg) diff --git a/R/tabix.functions.R b/R/tabix.functions.R index d8d2400f..73d37484 100644 --- a/R/tabix.functions.R +++ b/R/tabix.functions.R @@ -484,7 +484,7 @@ applyTbxByChunk<-function(tbxFile,chunk.size=1e6,dir,filename, } -# applyTbxByCHr +# applyTbxByChr #' Apply a function on tabix files chromosome by chromosome #' #' The function reads a tabix file chromosome by chromosome and applies @@ -511,7 +511,8 @@ applyTbxByChunk<-function(tbxFile,chunk.size=1e6,dir,filename, #' @return either a path to a tabix or text file, or a data frame or data.table #' @noRd applyTbxByChr<-function(tbxFile,chrs,dir,filename, - return.type=c("tabix","data.frame","data.table"), + return.type=c("tabix","data.frame","data.table", + "GRanges"), FUN,...,mc.cores=1){ return.type <- match.arg(return.type) @@ -519,7 +520,7 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, if(Sys.info()['sysname']=="Windows") {mc.cores = 1} if(missing(chrs)) { chrs = Rsamtools::seqnamesTabix(tbxFile)} if(return.type =="tabix"){ - + # create a custom function that contains the function # to be applied myFunc<-function(chr,tbxFile,dir,filename,FUN,...){ @@ -539,11 +540,11 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, res=mclapply(chrs,myFunc,tbxFile,dir,filename2,FUN,...,mc.cores = mc.cores) # collect & cat temp files,then make tabix - + path <- catsub2tabix(dir,filename2,filename) return(gsub(".tbi","",path)) - + }else if(return.type=="data.frame"){ @@ -558,7 +559,7 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, # collect and return data.frame(rbindlist(res)) - }else{ + }else if(return.type=="data.table"){ myFunc3<-function(chr,tbxFile,FUN,...){ data=getTabixByChr(chr = chr,tbxFile,return.type="data.table") @@ -569,6 +570,17 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename, # collect and return rbindlist(res) + }else{ + myFunc4<-function(chr,tbxFile,FUN,...){ + data=getTabixByChr(chr = chr,tbxFile,return.type="GRanges") + res=FUN(data,...) + } + + res=mclapply(chrs,myFunc4,tbxFile,FUN,...,mc.cores = mc.cores) + + # collect and return + # suppressWarnings -> combined objs dont have chrs in common + suppressWarnings( do.call("c", res) ) } } diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r index 36705501..eba810f6 100644 --- a/tests/testthat/test-8-methSeg.r +++ b/tests/testthat/test-8-methSeg.r @@ -25,6 +25,27 @@ test_that("check if methSeg returns error for methylBase objects" ,{ expect_error(methSeg(methylBase.obj,diagnostic.plot = FALSE)) }) +test_that("check if methSeg with cores > 1 is the same as cores=1" ,{ + a=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE) + b=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE, cores=2) + expect_equal(a,b) +}) + +test_that("check if methSeg with cores > 1 is the same as cores=1 (non-tabix file)" ,{ + a=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE) + b=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE, cores=2) + expect_equal(a,b) +}) + +methylRawDB.obj <- methRead( + system.file("extdata", "control1.myCpG.txt", package = "methylKit"), + sample.id = "ctrl1", assembly = "hg18") + +test_that("check if methSeg with cores > 1 is the same as cores=1 (tabix file)" ,{ + a=methSeg(methylRawDB.obj,diagnostic.plot = FALSE) + b=methSeg(methylRawDB.obj,diagnostic.plot = FALSE, cores=2) + expect_equal(a,b) +}) gr = as(methylRawList.obj[[1]],"GRanges") mcols(gr)$meth=100*gr$numCs/gr$coverage From 6e9dd280dad60669096aeaa1220383c41fd6b0f7 Mon Sep 17 00:00:00 2001 From: katwre Date: Thu, 28 Jun 2018 18:28:30 +0200 Subject: [PATCH 09/13] modified man/methSeg.Rd --- man/methSeg.Rd | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/man/methSeg.Rd b/man/methSeg.Rd index 83b615a9..a734737f 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -6,7 +6,7 @@ \title{Segment methylation or differential methylation profile} \usage{ methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, - initialize.on.subset = 1, ...) + initialize.on.subset = 1, cores = 1, ...) } \arguments{ \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, @@ -34,6 +34,9 @@ integer higher than 1 to define the number of regions to sample. By default uses the whole dataset, which can be time consuming on large datasets. (Default: 1)} +\item{cores}{number of cores. The function is parallelized per chromosome +(default: 1).} + \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg package, or to \code{\link[mclust]{densityMclust}} in Mclust package, could be used to fine tune the segmentation algorithm. @@ -74,7 +77,7 @@ After initial segmentation with fastseg(), segments are clustered \examples{ \donttest{ - download.file("https://github.com/BIMSBbioinfo/compgen2018/raw/master/day3_diffMeth/data/H1.chr21.chr22.rds", + download.file("https://dl.dropboxusercontent.com/u/1373164/H1.chr21.chr22.rds", destfile="H1.chr21.chr22.rds",method="curl") mbw=readRDS("H1.chr21.chr22.rds") From e628ede1586a1907cf4866764e860a3c0f5c2241 Mon Sep 17 00:00:00 2001 From: katwre Date: Fri, 29 Jun 2018 15:22:11 +0200 Subject: [PATCH 10/13] mc.cores, tests methylRawDB and other improvem. --- R/methSeg.R | 30 +++++++++++------------------- man/methSeg.Rd | 4 ++-- tests/testthat/test-8-methSeg.r | 4 ++-- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/R/methSeg.R b/R/methSeg.R index 99702204..ebacbe58 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -31,7 +31,6 @@ if(length(seg.res)==1) { warning("segmentation produced only one range, no mixture modeling possible.") seg.res$seg.group <- "1" - return(seg.res) } return(seg.res) } @@ -135,7 +134,7 @@ #' integer higher than 1 to define the number of regions to sample. #' By default uses the whole dataset, which can be time consuming on #' large datasets. (Default: 1) -#' @param cores number of cores. The function is parallelized per chromosome +#' @param mc.cores number of cores. The function is parallelized per chromosome #' (default: 1). #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg #' package, or to \code{\link[mclust]{densityMclust}} @@ -196,11 +195,11 @@ #' @rdname methSeg methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, initialize.on.subset=1, - cores=1, ...){ + mc.cores=1, ...){ # 1. Run segmentation - if(cores==1){ + if(mc.cores==1){ ##coerce object to granges if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") { @@ -225,7 +224,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, seg.res = .run.fastseg(obj) } - if(cores>1){ + if(mc.cores>1){ ## Non-tabix files if (class(obj)=="methylDiff" | class(obj)=="methylRaw" | @@ -254,7 +253,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, # run methSeg on chromosomes (Could be in parallel or not) seg.res.list <- mclapply(chrs, function(chr){ seg.res = .run.fastseg(obj[seqnames(obj)==chr]) - }, mc.cores=cores) + }, mc.cores=mc.cores) # Merge resulting segments again to whole genome. # suppressWarnings -> combined objs dont have chrs in common @@ -263,24 +262,16 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, ## Tabix files } else if(class(obj)=="methylDiffDB" | class(obj)=="methylRawDB"){ - .run.fastseg.tabix = function(gr0, ...){ + .run.fastseg.tabix = function(gr0, class0, ...){ # adjust colnames of input GRanges to # methylKit naming convention df2getcolnames = as.data.frame(gr0[1]) df2getcolnames$width = NULL - methylKit:::.setMethylDBNames(df2getcolnames) + print(class0) + .setMethylDBNames(df2getcolnames, class0) names(mcols(gr0)) = colnames(df2getcolnames)[5:ncol(df2getcolnames)] - if("coverage" %in% names(mcols(gr0))){ - gr0$meth = 100*gr0$numCs/gr0$coverage - gr0 = gr0[,"meth"] - }else if("meth.diff" %in% names(mcols(gr0))){ - gr0 = gr0[,"meth.diff"] - }else if (class(obj) != "GRanges"){ - stop("only methylRaw or methylDiff objects ", - "or GRanges objects can be used in this function") - } # destrand strand(gr0) <- "*" @@ -290,8 +281,9 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, seg.res <- applyTbxByChr(obj@dbpath, return.type = "GRanges", FUN = .run.fastseg.tabix, + class = class(obj), ..., - mc.cores = cores) + mc.cores = mc.cores) } } @@ -302,7 +294,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, diagnostic.plot=diagnostic.plot, join.neighbours=join.neighbours, initialize.on.subset=initialize.on.subset, - cores=cores, ...) + ...) } return(seg.res) } diff --git a/man/methSeg.Rd b/man/methSeg.Rd index a734737f..e798ab85 100644 --- a/man/methSeg.Rd +++ b/man/methSeg.Rd @@ -6,7 +6,7 @@ \title{Segment methylation or differential methylation profile} \usage{ methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, - initialize.on.subset = 1, cores = 1, ...) + initialize.on.subset = 1, mc.cores = 1, ...) } \arguments{ \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, @@ -34,7 +34,7 @@ integer higher than 1 to define the number of regions to sample. By default uses the whole dataset, which can be time consuming on large datasets. (Default: 1)} -\item{cores}{number of cores. The function is parallelized per chromosome +\item{mc.cores}{number of cores. The function is parallelized per chromosome (default: 1).} \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r index eba810f6..1af1b7b2 100644 --- a/tests/testthat/test-8-methSeg.r +++ b/tests/testthat/test-8-methSeg.r @@ -37,9 +37,9 @@ test_that("check if methSeg with cores > 1 is the same as cores=1 (non-tabix fil expect_equal(a,b) }) -methylRawDB.obj <- methRead( +methylRawDB.obj <- suppressMessages( methRead( system.file("extdata", "control1.myCpG.txt", package = "methylKit"), - sample.id = "ctrl1", assembly = "hg18") + sample.id = "ctrl1", assembly = "hg18", dbtype="tabix") ) test_that("check if methSeg with cores > 1 is the same as cores=1 (tabix file)" ,{ a=methSeg(methylRawDB.obj,diagnostic.plot = FALSE) From aff6561b0f1a2f565526b7098f5dcfb68e3bba6c Mon Sep 17 00:00:00 2001 From: katwre Date: Mon, 2 Jul 2018 12:13:14 +0200 Subject: [PATCH 11/13] Add more comments --- R/methSeg.R | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/R/methSeg.R b/R/methSeg.R index ebacbe58..1f6101c8 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -1,5 +1,8 @@ # segmentation functions +#' # Segment methylation calls. +#' Run fastseg GRanges object `obj` +#' with additional parameters in `...`. .run.fastseg = function(obj, ...){ dots <- list(...) @@ -35,7 +38,12 @@ return(seg.res) } - +#' # Use mixture modeling for the density function estimated +#' # and BIC criterion used to decide the optimum number of components +#' # in mixture modeling. +#' Run mclust::densityMclust using GRanges object `seg.res`, look for description +#' of `join.neighbours`, `initialize.on.subset` and other parameters +#' in methylKit methSeg. .run.mclust = function(seg.res, diagnostic.plot=TRUE, join.neighbours=FALSE, initialize.on.subset=1, ...){ @@ -197,7 +205,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, initialize.on.subset=1, mc.cores=1, ...){ - # 1. Run segmentation + # 1. Run segmentation using fastseg if(mc.cores==1){ @@ -252,6 +260,8 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, # Split the input object based on chromosome # run methSeg on chromosomes (Could be in parallel or not) seg.res.list <- mclapply(chrs, function(chr){ + + # Run segmentation seg.res = .run.fastseg(obj[seqnames(obj)==chr]) }, mc.cores=mc.cores) @@ -264,8 +274,8 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, .run.fastseg.tabix = function(gr0, class0, ...){ - # adjust colnames of input GRanges to - # methylKit naming convention + # rename names of meta columns of input GRanges `gr0` to + # methylKit naming convention (such as e.g. coverage, numCs, numTs) df2getcolnames = as.data.frame(gr0[1]) df2getcolnames$width = NULL print(class0) @@ -275,9 +285,11 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, # destrand strand(gr0) <- "*" + # Run segmentation .run.fastseg(gr0, ...) } + # Run segmentation for each chromosome seg.res <- applyTbxByChr(obj@dbpath, return.type = "GRanges", FUN = .run.fastseg.tabix, @@ -287,7 +299,8 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, } } - # 2. Density Estimation via Model-Based Clustering + # 2. Density Estimation via Model-Based Clustering using + # mclust::densityMclust function if( length(seg.res) > 1 ){ seg.res = .run.mclust(seg.res, From f8020c0bb942e432014c1b00b2df8fcd43337374 Mon Sep 17 00:00:00 2001 From: katwre Date: Tue, 3 Jul 2018 10:12:27 +0200 Subject: [PATCH 12/13] don't subsample segments after joining neighbouring segments --- R/methSeg.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/methSeg.R b/R/methSeg.R index 1f6101c8..d8eb92bb 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -58,6 +58,7 @@ diagnostic.plot = FALSE } + # Run subsampling of segments before performing the mixture modeling if("initialization" %in% names(args.Mclust)){ if("subset" %in% names(args.Mclust[["initialization"]])) { if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){ @@ -105,6 +106,9 @@ args.Mclust[["diagnostic.plot"]]=diagnostic.plot # skip second progress bar args.Mclust[["verbose"]]=FALSE + # do not run subsampling of segments + args.Mclust[["initialization"]]<-NULL + dens=do.call("densityFind", args.Mclust ) } From f747c677ffc83e2ad4e41dd9157bca4e78fe2b20 Mon Sep 17 00:00:00 2001 From: katwre Date: Sun, 15 Jul 2018 21:42:54 +0200 Subject: [PATCH 13/13] keep only seqlevels that are non-empty --- R/methSeg.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/methSeg.R b/R/methSeg.R index d8eb92bb..687ad47b 100644 --- a/R/methSeg.R +++ b/R/methSeg.R @@ -259,7 +259,9 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE, } # destrand strand(obj) <- "*" - + + # keep only seqlevels that are non-empty + seqlevels(obj) <- seqlevelsInUse(obj) chrs = seqlevels(obj) # Split the input object based on chromosome # run methSeg on chromosomes (Could be in parallel or not)