From c75ce35ace303dd1a8df507a44cfda3e70f3fb4d Mon Sep 17 00:00:00 2001
From: Katarzyna Wreczycka <katwre@gmail.com>
Date: Wed, 13 Jun 2018 16:51:58 +0200
Subject: [PATCH 01/13] added estimate.params.density param to methSeg

---
 R/methSeg.R | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index dfbc240c..b2d411f5 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -23,6 +23,13 @@
 #' @param join.neighbours if TRUE neighbouring segments that cluster to the same 
 #'        seg.group are joined by extending the ranges, summing up num.marks and
 #'        averaging over seg.means.
+#' @param estimate.params.density a numeric value indicating percentage of regions
+#'        to sample and then estimate initial parameters (G and modelName) for a
+#'        function to calculate density (\code{\link[mclust]{densityMclust}}). 
+#'        The value can be between 0 and 1, e.g. 0.1 means that 10% of data will 
+#'        be used for sampling, otherwise it uses the whole dataset (Default: 1) 
+#'        and it's time consuming on large datasets. If 0 or 1 then the function 
+#'        will be executed without sampling. 
 #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 #'        package, or to \code{\link[mclust]{densityMclust}}
 #'        in Mclust package, could be used to fine tune the segmentation algorithm.
@@ -62,14 +69,15 @@
 #' 
 #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE))
 #' 
-#' @author Altuna Akalin, contributions by Arsene Wabo
+#' @author Altuna Akalin, contributions by Arsene Waboand Katarzyna Wreczycka
 #' 
 #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} 
 #' 
 #' @export
 #' @docType methods
 #' @rdname methSeg       
-methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,  ...){
+methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
+                  estimate.params.density=1, ...){
   
   dots <- list(...)  
   
@@ -133,10 +141,25 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,  ...){
     diagnostic.plot = FALSE
   }
   
-  # decide on number of components/groups
-  args.Mclust[["score.gr"]]=seg.res
-  args.Mclust[["diagnostic.plot"]]=diagnostic.plot
-  dens=do.call("densityFind", args.Mclust  )
+  if(estimate.params.density>0 & estimate.params.density<1 ){
+    
+    nbr.sample = floor(length(seg.res) * estimate.params.density)
+    # estimate parameters for mclust
+    args.Mclust.esti = args.Mclust
+    args.Mclust.esti[["score.gr"]]=seg.res[ sample(1:length(seg.res), nbr.sample) ]
+    args.Mclust.esti[["diagnostic.plot"]]=FALSE
+    dens_estimate=do.call("densityFind", args.Mclust.esti)
+
+    args.Mclust[["modelName"]] = dens_estimate$modelName
+    args.Mclust[["G"]] = 1:dens_estimate$G
+   }  
+  
+  if(!join.neighbours) {
+    # decide on number of components/groups
+    args.Mclust[["score.gr"]]=seg.res
+    args.Mclust[["diagnostic.plot"]]=diagnostic.plot
+    dens=do.call("densityFind", args.Mclust  )
+  }
   
   # add components/group ids 
   mcols(seg.res)$seg.group=as.character(dens$classification)

From 5378e6162695559f5ce2daecffd56b6576e0cbce Mon Sep 17 00:00:00 2001
From: Kasia Wreczycka <katarzyna.wreczycka@mdc-berlin.de>
Date: Thu, 14 Jun 2018 12:48:21 +0200
Subject: [PATCH 02/13] remove if clause

---
 R/methSeg.R    | 64 +++++++++++++++++++++++++-------------------------
 man/methSeg.Rd | 11 ++++++++-
 2 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index b2d411f5..1a05daaf 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -52,7 +52,7 @@
 #' @examples 
 #' 
 #' \donttest{
-#'  download.file("https://github.com/BIMSBbioinfo/compgen2018/raw/master/day3_diffMeth/data/H1.chr21.chr22.rds",
+#'  download.file("https://dl.dropboxusercontent.com/u/1373164/H1.chr21.chr22.rds",
 #'                destfile="H1.chr21.chr22.rds",method="curl")
 #' 
 #'  mbw=readRDS("H1.chr21.chr22.rds")
@@ -69,7 +69,7 @@
 #' 
 #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE))
 #' 
-#' @author Altuna Akalin, contributions by Arsene Waboand Katarzyna Wreczycka
+#' @author Altuna Akalin, contributions by Arsene Wabo
 #' 
 #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} 
 #' 
@@ -84,15 +84,15 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
   
   ##coerce object to granges
   if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") {
-      obj= as(obj,"GRanges")
-      ## calculate methylation score 
-      mcols(obj)$meth=100*obj$numCs/obj$coverage
-      ## select only required mcol
-      obj = obj[,"meth"]
+    obj= as(obj,"GRanges")
+    ## calculate methylation score 
+    mcols(obj)$meth=100*obj$numCs/obj$coverage
+    ## select only required mcol
+    obj = obj[,"meth"]
   }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") {
-      obj = as(obj,"GRanges")
-      ## use methylation difference as score
-      obj = obj[,"meth.diff"]
+    obj = as(obj,"GRanges")
+    ## use methylation difference as score
+    obj = obj[,"meth.diff"]
   }else if (class(obj) != "GRanges"){
     stop("only methylRaw or methylDiff objects ", 
          "or GRanges objects can be used in this function")
@@ -149,34 +149,34 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
     args.Mclust.esti[["score.gr"]]=seg.res[ sample(1:length(seg.res), nbr.sample) ]
     args.Mclust.esti[["diagnostic.plot"]]=FALSE
     dens_estimate=do.call("densityFind", args.Mclust.esti)
-
+    
     args.Mclust[["modelName"]] = dens_estimate$modelName
     args.Mclust[["G"]] = 1:dens_estimate$G
-   }  
+  }  
+  
+  
+  # decide on number of components/groups
+  args.Mclust[["score.gr"]]=seg.res
+  args.Mclust[["diagnostic.plot"]]=diagnostic.plot
+  dens=do.call("densityFind", args.Mclust  )
   
-  if(!join.neighbours) {
-    # decide on number of components/groups
-    args.Mclust[["score.gr"]]=seg.res
-    args.Mclust[["diagnostic.plot"]]=diagnostic.plot
-    dens=do.call("densityFind", args.Mclust  )
-  }
   
   # add components/group ids 
   mcols(seg.res)$seg.group=as.character(dens$classification)
   
   # if joining, show clustering after joining
   if(join.neighbours) {
-      message("joining neighbouring segments and repeating clustering.")
-      seg.res <- joinSegmentNeighbours(seg.res)
-      diagnostic.plot <- diagnostic.plot.old
-      
-      # get the new density
-      args.Mclust[["score.gr"]]=seg.res
-      args.Mclust[["diagnostic.plot"]]=diagnostic.plot
-      # skip second progress bar
-      args.Mclust[["verbose"]]=FALSE
-      dens=do.call("densityFind", args.Mclust  )
-      
+    message("joining neighbouring segments and repeating clustering.")
+    seg.res <- joinSegmentNeighbours(seg.res)
+    diagnostic.plot <- diagnostic.plot.old
+    
+    # get the new density
+    args.Mclust[["score.gr"]]=seg.res
+    args.Mclust[["diagnostic.plot"]]=diagnostic.plot
+    # skip second progress bar
+    args.Mclust[["verbose"]]=FALSE
+    dens=do.call("densityFind", args.Mclust  )
+    
   }
   
   seg.res
@@ -255,9 +255,9 @@ diagPlot<-function(dens,score.gr){
 #' @docType methods
 #' @rdname methSeg2bed
 methSeg2bed<-function(segments,filename,
-trackLine="track name='meth segments' description='meth segments' itemRgb=On",
-colramp=colorRamp(c("gray","green", "darkgreen"))
-                        ){
+                      trackLine="track name='meth segments' description='meth segments' itemRgb=On",
+                      colramp=colorRamp(c("gray","green", "darkgreen"))
+){
   if(class(segments)!="GRanges"){
     stop("segments object has to be of class GRanges")
   }
diff --git a/man/methSeg.Rd b/man/methSeg.Rd
index 935e9ff4..6e764441 100644
--- a/man/methSeg.Rd
+++ b/man/methSeg.Rd
@@ -5,7 +5,8 @@
 \alias{methSeg}
 \title{Segment methylation or differential methylation profile}
 \usage{
-methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE, ...)
+methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE,
+  estimate.params.density = 1, ...)
 }
 \arguments{
 \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, 
@@ -25,6 +26,14 @@ in mixture modeling.}
 seg.group are joined by extending the ranges, summing up num.marks and
 averaging over seg.means.}
 
+\item{estimate.params.density}{a numeric value indicating percentage of regions
+to sample and then estimate initial parameters (G and modelName) for a
+function to calculate density (\code{\link[mclust]{densityMclust}}). 
+The value can be between 0 and 1, e.g. 0.1 means that 10% of data will 
+be used for sampling, otherwise it uses the whole dataset (Default: 1) 
+and it's time consuming on large datasets. If 0 or 1 then the function 
+will be executed without sampling.}
+
 \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 package, or to \code{\link[mclust]{densityMclust}}
 in Mclust package, could be used to fine tune the segmentation algorithm.

From eb85d019069df15d0d5433c8ac5f7af718c6d407 Mon Sep 17 00:00:00 2001
From: Kasia Wreczycka <katarzyna.wreczycka@mdc-berlin.de>
Date: Thu, 14 Jun 2018 13:17:50 +0200
Subject: [PATCH 03/13] fix previous commit

---
 R/methSeg.R    | 3 ++-
 man/methSeg.Rd | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index 1a05daaf..f151c558 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -69,7 +69,8 @@
 #' 
 #' unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE))
 #' 
-#' @author Altuna Akalin, contributions by Arsene Wabo
+#' @author Altuna Akalin, contributions by Arsene Wabo and Katarzyna
+#' Wreczycka
 #' 
 #' @seealso \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}} 
 #' 
diff --git a/man/methSeg.Rd b/man/methSeg.Rd
index 6e764441..5567ff73 100644
--- a/man/methSeg.Rd
+++ b/man/methSeg.Rd
@@ -87,5 +87,6 @@ unlink(list.files(pattern="H1.chr21.chr22",full.names=TRUE))
 \code{\link{methSeg2bed}}, \code{\link{joinSegmentNeighbours}}
 }
 \author{
-Altuna Akalin, contributions by Arsene Wabo
+Altuna Akalin, contributions by Arsene Wabo and Katarzyna
+Wreczycka
 }

From a51c93ebc2fd1bf7d2474e0721c1ac9365c56bf0 Mon Sep 17 00:00:00 2001
From: alexg9010 <alex.gos90@gmail.com>
Date: Thu, 14 Jun 2018 18:06:17 +0200
Subject: [PATCH 04/13] Updates to @katwre pull request: - changed parameter
 name from `estimate.params.density` to `initialize.on.subset` - allow for
 values higher than 1, which directly yields number of samples - priorize
 Mclust argument `initialization` - check for to few samples, 9 seems to be
 the magic number - add tests

---
 R/methSeg.R                     | 50 ++++++++++++++++++++++-----------
 man/methSeg.Rd                  | 16 +++++------
 tests/testthat/test-8-methSeg.r |  6 ++++
 3 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index f151c558..94f0d128 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -23,13 +23,13 @@
 #' @param join.neighbours if TRUE neighbouring segments that cluster to the same 
 #'        seg.group are joined by extending the ranges, summing up num.marks and
 #'        averaging over seg.means.
-#' @param estimate.params.density a numeric value indicating percentage of regions
-#'        to sample and then estimate initial parameters (G and modelName) for a
-#'        function to calculate density (\code{\link[mclust]{densityMclust}}). 
-#'        The value can be between 0 and 1, e.g. 0.1 means that 10% of data will 
-#'        be used for sampling, otherwise it uses the whole dataset (Default: 1) 
-#'        and it's time consuming on large datasets. If 0 or 1 then the function 
-#'        will be executed without sampling. 
+#' @param initialize.on.subset a numeric value indicating either percentage or
+#'        absolute value of regions to subsample from segments before performing
+#'        the mixture modeling. The value can be either between 0 and 1, 
+#'        e.g. 0.1 means that 10% of data will be used for sampling, or be an 
+#'        integer higher than 1 to define the number of regions to sample. 
+#'        By default uses the whole dataset, which can be time consuming on 
+#'        large datasets. (Default: 1)
 #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 #'        package, or to \code{\link[mclust]{densityMclust}}
 #'        in Mclust package, could be used to fine tune the segmentation algorithm.
@@ -78,7 +78,7 @@
 #' @docType methods
 #' @rdname methSeg       
 methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
-                  estimate.params.density=1, ...){
+                  initialize.on.subset=1, ...){
   
   dots <- list(...)  
   
@@ -142,17 +142,33 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
     diagnostic.plot = FALSE
   }
   
-  if(estimate.params.density>0 & estimate.params.density<1 ){
+  if("initialization" %in% names(args.Mclust)){
+    if("subset" %in% names(args.Mclust[["initialization"]])) {
+      if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){
+        stop("too few samples, increase the size of subset.") 
+        }
+      message(paste("initializing clustering with",
+                    length(args.Mclust[["initialization"]][["subset"]]),
+                    "segments."))
+      initialize.on.subset = 1
+    }
+  }
+  
+  if(initialize.on.subset != 1 && initialize.on.subset > 0 ) {
     
-    nbr.sample = floor(length(seg.res) * estimate.params.density)
-    # estimate parameters for mclust
-    args.Mclust.esti = args.Mclust
-    args.Mclust.esti[["score.gr"]]=seg.res[ sample(1:length(seg.res), nbr.sample) ]
-    args.Mclust.esti[["diagnostic.plot"]]=FALSE
-    dens_estimate=do.call("densityFind", args.Mclust.esti)
+    if( initialize.on.subset > 0 & initialize.on.subset < 1 )
+      nbr.sample = floor(length(seg.res) * initialize.on.subset)
     
-    args.Mclust[["modelName"]] = dens_estimate$modelName
-    args.Mclust[["G"]] = 1:dens_estimate$G
+    if( initialize.on.subset > 1) 
+      nbr.sample = initialize.on.subset
+    
+    if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") }
+    
+    message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments."))
+    # estimate parameters for mclust
+    sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE)
+    if(length(sub) < 9 ){stop("too few samples, increase the size of subset.") }
+    args.Mclust[["initialization"]]=list(subset = sub)
   }  
   
   
diff --git a/man/methSeg.Rd b/man/methSeg.Rd
index 5567ff73..6f1a2765 100644
--- a/man/methSeg.Rd
+++ b/man/methSeg.Rd
@@ -6,7 +6,7 @@
 \title{Segment methylation or differential methylation profile}
 \usage{
 methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE,
-  estimate.params.density = 1, ...)
+  initialize.on.subset = 1, ...)
 }
 \arguments{
 \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, 
@@ -26,13 +26,13 @@ in mixture modeling.}
 seg.group are joined by extending the ranges, summing up num.marks and
 averaging over seg.means.}
 
-\item{estimate.params.density}{a numeric value indicating percentage of regions
-to sample and then estimate initial parameters (G and modelName) for a
-function to calculate density (\code{\link[mclust]{densityMclust}}). 
-The value can be between 0 and 1, e.g. 0.1 means that 10% of data will 
-be used for sampling, otherwise it uses the whole dataset (Default: 1) 
-and it's time consuming on large datasets. If 0 or 1 then the function 
-will be executed without sampling.}
+\item{initialize.on.subset}{a numeric value indicating either percentage or
+absolute value of regions to subsample from segments before performing
+the mixture modeling. The value can be either between 0 and 1, 
+e.g. 0.1 means that 10% of data will be used for sampling, or be an 
+integer higher than 1 to define the number of regions to sample. 
+By default uses the whole dataset, which can be time consuming on 
+large datasets. (Default: 1)}
 
 \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 package, or to \code{\link[mclust]{densityMclust}}
diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r
index 72a8edb5..36705501 100644
--- a/tests/testthat/test-8-methSeg.r
+++ b/tests/testthat/test-8-methSeg.r
@@ -55,6 +55,12 @@ test_that("check if joining neighbours works" ,{
   expect_true(all(rle(res2$seg.group)$lengths == 1))
 })
 
+test_that("check if initialization works" ,{
+  expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE),"GRanges")
+  expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialization=list(subset = sample(1:13,9,replace = FALSE))),"GRanges")
+  expect_error(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialize.on.subset = 0.6))
+  expect_is(methSeg(methylDiff.obj,diagnostic.plot = FALSE,verbose=FALSE,initialize.on.subset = 10),"GRanges")
+})
 
 seg <- methSeg(tileRaw,diagnostic.plot = FALSE)
 methSeg2bed(segments = seg, filename = "test.bed")

From d3970a8ea75230fd645e72c4c8fc0087d0f4cfa7 Mon Sep 17 00:00:00 2001
From: Alexander Gosdschan <alexg9010@users.noreply.github.com>
Date: Sat, 16 Jun 2018 11:29:16 +0200
Subject: [PATCH 05/13] Update methSeg.R

Remove duplicate check for sample size.
---
 R/methSeg.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index 94f0d128..0bdb42eb 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -167,7 +167,6 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
     message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments."))
     # estimate parameters for mclust
     sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE)
-    if(length(sub) < 9 ){stop("too few samples, increase the size of subset.") }
     args.Mclust[["initialization"]]=list(subset = sub)
   }  
   

From b5df9d04a435c7f5167a7f14a11da4c2181d693d Mon Sep 17 00:00:00 2001
From: alexg9010 <alex.gos90@gmail.com>
Date: Mon, 18 Jun 2018 10:35:15 +0200
Subject: [PATCH 06/13] update documentation

---
 R/methSeg.R    | 10 ++++++++++
 man/methSeg.Rd |  9 +++++++++
 2 files changed, 19 insertions(+)

diff --git a/R/methSeg.R b/R/methSeg.R
index 0bdb42eb..7dae17ff 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -49,6 +49,16 @@
 #' @details      
 #'        To be sure that the algorithm will work on your data, 
 #'        the object should have at least 5000 records
+#'        
+#' @details After initial segmentation with fastseg(), segments are clustered 
+#'        into self-similar groups based on their mean methylation profile 
+#'        using mixture modeling. Mixture modeling estimates the initial 
+#'        parameters of the distributions by using the whole dataset. 
+#'        If "initialize.on.subset" argument set as described, the function 
+#'        will use a subset of the data to initialize the parameters of the 
+#'        mixture modeling prior to the Expectation-maximization (EM) algorithm 
+#'        used by \code{\link{Mclust}} package.
+#'  
 #' @examples 
 #' 
 #' \donttest{
diff --git a/man/methSeg.Rd b/man/methSeg.Rd
index 6f1a2765..83b615a9 100644
--- a/man/methSeg.Rd
+++ b/man/methSeg.Rd
@@ -61,6 +61,15 @@ as high or low methylated regions.
 \details{
 To be sure that the algorithm will work on your data, 
        the object should have at least 5000 records
+
+After initial segmentation with fastseg(), segments are clustered 
+       into self-similar groups based on their mean methylation profile 
+       using mixture modeling. Mixture modeling estimates the initial 
+       parameters of the distributions by using the whole dataset. 
+       If "initialize.on.subset" argument set as described, the function 
+       will use a subset of the data to initialize the parameters of the 
+       mixture modeling prior to the Expectation-maximization (EM) algorithm 
+       used by \code{\link{Mclust}} package.
 }
 \examples{
 

From 75b949f1914bf0a7d9a0dc6cdf085e2e5079b945 Mon Sep 17 00:00:00 2001
From: alexg9010 <alex.gos90@gmail.com>
Date: Mon, 18 Jun 2018 10:54:16 +0200
Subject: [PATCH 07/13] update NEWS and version bump

---
 DESCRIPTION | 5 +++--
 NEWS        | 8 ++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e754ae73..593a9da4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,14 +2,15 @@ Package: methylKit
 Type: Package
 Title: DNA methylation analysis from high-throughput
     bisulfite sequencing results
-Version: 1.7.4
+Version: 1.7.5
 Authors@R: c(person("Altuna", "Akalin", role = c("aut", "cre"),
                     email = "aakalin@gmail.com"),
 	           person("Matthias","Kormaksson", role = "aut"),
 	           person("Sheng","Li", role = "aut"),
 	           person("Arsene", "Wabo", role = "ctb"),
 	           person("Adrian", "Bierling", role= "aut"),
-	           person("Alexander","Gosdschan", role="aut"))
+	           person("Alexander","Gosdschan", role="aut"),
+	           person("Katarzyna","Wreczycka", role="ctb"))
 Author: Altuna Akalin [aut, cre], Matthias Kormaksson [aut], 
     Sheng Li [aut], Arsene Wabo [ctb], Adrian Bierling [aut], 
     Alexander Gosdschan [aut]
diff --git a/NEWS b/NEWS
index 079f646f..f363e11c 100644
--- a/NEWS
+++ b/NEWS
@@ -1,8 +1,16 @@
+methylKit 1.7.5
+--------------
+NEW FUNCTIONS AND FEATURES
+* methSeg(): introduce parameter `initialize.on.subset` to subset data
+  for initialization of mixture modeling; update description; add tests
+
+
 methylKit 1.7.4
 --------------
 IMPROVEMENTS AND BUG FIXES
 * update link to test-file for methSeg() function
 
+
 methylKit 1.7.3
 --------------
 IMPROVEMENTS AND BUG FIXES

From 4395aafb4e4632d86a78e7f4078d41d707008b6f Mon Sep 17 00:00:00 2001
From: Kasia Wreczycka <katarzyna.wreczycka@mdc-berlin.de>
Date: Thu, 28 Jun 2018 18:14:04 +0200
Subject: [PATCH 08/13] Added cores to methSeg

---
 DESCRIPTION                     |   2 +-
 NEWS                            |   7 +
 R/methSeg.R                     | 316 +++++++++++++++++++++-----------
 R/tabix.functions.R             |  24 ++-
 tests/testthat/test-8-methSeg.r |  21 +++
 5 files changed, 255 insertions(+), 115 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 593a9da4..02642479 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: methylKit
 Type: Package
 Title: DNA methylation analysis from high-throughput
     bisulfite sequencing results
-Version: 1.7.5
+Version: 1.7.6
 Authors@R: c(person("Altuna", "Akalin", role = c("aut", "cre"),
                     email = "aakalin@gmail.com"),
 	           person("Matthias","Kormaksson", role = "aut"),
diff --git a/NEWS b/NEWS
index f363e11c..b0aecf4b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,10 @@
+methylKit 1.7.6
+--------------
+NEW FUNCTIONS AND FEATURES
+* methSeg(): introduce parameter `cores` to run fastseg in parallel
+  per chromosome; update description; add tests
+
+
 methylKit 1.7.5
 --------------
 NEW FUNCTIONS AND FEATURES
diff --git a/R/methSeg.R b/R/methSeg.R
index 7dae17ff..99702204 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -1,5 +1,110 @@
 # segmentation functions 
 
+.run.fastseg = function(obj, ...){
+  
+  dots <- list(...)
+  
+  ## check wether obj contains at least one metacol 
+  if(ncol(elementMetadata(obj))<1)
+    stop("GRanges does not have any meta column.")
+  
+  ## check wether obj contains is sorted by position
+  if(is.unsorted(obj,ignore.strand=TRUE)) {
+    obj <- sort(obj,ignore.strand=TRUE)
+    message("Object not sorted by position, sorting now.")
+  }
+  
+  ## check wether obj contains at least two ranges else stop
+  if(length(obj)<=1)
+    stop("segmentation requires at least two ranges.")
+  
+  # match argument names to fastseg arguments
+  args.fastseg=dots[names(dots) %in% names(formals(fastseg)[-1] ) ]  
+  
+  args.fastseg[["x"]]=obj
+  
+  # do the segmentation
+  #seg.res=fastseg(obj)
+  seg.res <- do.call("fastseg", args.fastseg)
+  
+  # stop if segmentation produced only one range
+  if(length(seg.res)==1) {
+    warning("segmentation produced only one range, no mixture modeling possible.")
+    seg.res$seg.group <- "1"
+    return(seg.res)
+  }
+  return(seg.res)
+}
+
+
+.run.mclust = function(seg.res, diagnostic.plot=TRUE, join.neighbours=FALSE,
+                       initialize.on.subset=1, ...){
+  
+  dots <- list(...)
+  
+  # match argument names to Mclust
+  args.Mclust=dots[names(dots) %in% names(formals(Mclust)[-1])  ]
+  
+  # if joining, do not show first clustering
+  if(join.neighbours) {
+    diagnostic.plot.old = diagnostic.plot
+    diagnostic.plot = FALSE
+  }
+  
+  if("initialization" %in% names(args.Mclust)){
+    if("subset" %in% names(args.Mclust[["initialization"]])) {
+      if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){
+        stop("too few samples, increase the size of subset.") 
+      }
+      message(paste("initializing clustering with",
+                    length(args.Mclust[["initialization"]][["subset"]]),
+                    "segments."))
+      initialize.on.subset = 1
+    }
+  }
+  
+  if(initialize.on.subset != 1 && initialize.on.subset > 0 ) {
+    
+    if( initialize.on.subset > 0 & initialize.on.subset < 1 )
+      nbr.sample = floor(length(seg.res) * initialize.on.subset)
+    
+    if( initialize.on.subset > 1) 
+      nbr.sample = initialize.on.subset
+    
+    if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") }
+    
+    message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments."))
+    # estimate parameters for mclust
+    sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE)
+    args.Mclust[["initialization"]]=list(subset = sub)
+  }  
+  
+  # decide on number of components/groups
+  args.Mclust[["score.gr"]]=seg.res
+  args.Mclust[["diagnostic.plot"]]=diagnostic.plot
+  dens=do.call("densityFind", args.Mclust  )
+  
+  # add components/group ids 
+  mcols(seg.res)$seg.group=as.character(dens$classification)
+  
+  # if joining, show clustering after joining
+  if(join.neighbours) {
+    message("joining neighbouring segments and repeating clustering.")
+    seg.res <- joinSegmentNeighbours(seg.res)
+    diagnostic.plot <- diagnostic.plot.old
+    
+    # get the new density
+    args.Mclust[["score.gr"]]=seg.res
+    args.Mclust[["diagnostic.plot"]]=diagnostic.plot
+    # skip second progress bar
+    args.Mclust[["verbose"]]=FALSE
+    dens=do.call("densityFind", args.Mclust  )
+    
+  }
+  return(seg.res)
+}
+
+
 #' Segment methylation or differential methylation profile
 #' 
 #' The function uses a segmentation algorithm (\code{\link[fastseg]{fastseg}}) 
@@ -30,6 +135,8 @@
 #'        integer higher than 1 to define the number of regions to sample. 
 #'        By default uses the whole dataset, which can be time consuming on 
 #'        large datasets. (Default: 1)
+#' @param cores number of cores. The function is parallelized per chromosome
+#'        (default: 1).
 #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 #'        package, or to \code{\link[mclust]{densityMclust}}
 #'        in Mclust package, could be used to fine tune the segmentation algorithm.
@@ -88,126 +195,119 @@
 #' @docType methods
 #' @rdname methSeg       
 methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
-                  initialize.on.subset=1, ...){
-  
-  dots <- list(...)  
-  
-  
-  ##coerce object to granges
-  if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") {
-    obj= as(obj,"GRanges")
-    ## calculate methylation score 
-    mcols(obj)$meth=100*obj$numCs/obj$coverage
-    ## select only required mcol
-    obj = obj[,"meth"]
-  }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") {
-    obj = as(obj,"GRanges")
-    ## use methylation difference as score
-    obj = obj[,"meth.diff"]
-  }else if (class(obj) != "GRanges"){
-    stop("only methylRaw or methylDiff objects ", 
-         "or GRanges objects can be used in this function")
-  }
-  
-  # destrand
-  strand(obj) <- "*"
-  
-  ## check wether obj contains at least one metacol 
-  if(ncol(elementMetadata(obj))<1)
-    stop("GRanges does not have any meta column.")
-  
-  ## check wether obj contains is sorted by position
-  if(is.unsorted(obj,ignore.strand=TRUE)) {
-    obj <- sort(obj,ignore.strand=TRUE)
-    message("Object not sorted by position, sorting now.")
-  }
-  
-  ## check wether obj contains at least two ranges else stop
-  if(length(obj)<=1)
-    stop("segmentation requires at least two ranges.")
-  
-  # match argument names to fastseg arguments
-  args.fastseg=dots[names(dots) %in% names(formals(fastseg)[-1] ) ]  
+                  initialize.on.subset=1,
+                  cores=1, ...){
   
-  # match argument names to Mclust
-  args.Mclust=dots[names(dots) %in% names(formals(Mclust)[-1])  ]
+  # 1. Run segmentation
   
-  args.fastseg[["x"]]=obj
-  
-  # do the segmentation
-  #seg.res=fastseg(obj)
-  seg.res <- do.call("fastseg", args.fastseg)
-  #seg.res <- do.call("fastseg2", args.fastseg)
-  
-  # stop if segmentation produced only one range
-  if(length(seg.res)==1) {
-    warning("segmentation produced only one range, no mixture modeling possible.")
-    seg.res$seg.group <- "1"
-    return(seg.res)
-  }
-  
-  # if joining, do not show first clustering
-  if(join.neighbours) {
-    diagnostic.plot.old = diagnostic.plot
-    diagnostic.plot = FALSE
+  if(cores==1){
+    
+    ##coerce object to granges
+    if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") {
+      obj= as(obj,"GRanges")
+      ## calculate methylation score 
+      mcols(obj)$meth=100*obj$numCs/obj$coverage
+      ## select only required mcol
+      obj = obj[,"meth"]
+    }else if (class(obj)=="methylDiff" | class(obj)=="methylDiffDB") {
+      obj = as(obj,"GRanges")
+      ## use methylation difference as score
+      obj = obj[,"meth.diff"]
+    }else if (class(obj) != "GRanges"){
+      stop("only methylRaw or methylDiff objects ", 
+           "or GRanges objects can be used in this function")
+    }
+    
+    # destrand
+    strand(obj) <- "*"
+    
+    # Run segmentation
+    seg.res = .run.fastseg(obj)
   }
   
-  if("initialization" %in% names(args.Mclust)){
-    if("subset" %in% names(args.Mclust[["initialization"]])) {
-      if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){
-        stop("too few samples, increase the size of subset.") 
+  if(cores>1){
+    
+    ## Non-tabix files  
+    if (class(obj)=="methylDiff" | class(obj)=="methylRaw" | 
+        class(obj)=="GRanges"){
+      
+      ##coerce object to granges
+      if(class(obj)=="methylRaw") {
+        obj= as(obj,"GRanges")
+        ## calculate methylation score 
+        mcols(obj)$meth=100*obj$numCs/obj$coverage
+        ## select only required mcol
+        obj = obj[,"meth"]
+      } else if(class(obj)=="methylDiff") {
+        obj = as(obj,"GRanges")
+        ## use methylation difference as score
+        obj = obj[,"meth.diff"]
+      }else if (class(obj) != "GRanges"){
+        stop("only methylRaw or methylDiff objects ", 
+             "or GRanges objects can be used in this function")
+      }
+      # destrand
+      strand(obj) <- "*"
+      
+      chrs = seqlevels(obj)
+      # Split the input object based on chromosome
+      # run methSeg on chromosomes (Could be in parallel or not)
+      seg.res.list <- mclapply(chrs, function(chr){
+        seg.res = .run.fastseg(obj[seqnames(obj)==chr])
+      }, mc.cores=cores)
+      
+      # Merge resulting segments again to whole genome.
+      # suppressWarnings -> combined objs dont have chrs in common
+      seg.res = suppressWarnings( do.call("c", seg.res.list) )
+      
+      ## Tabix files
+    } else if(class(obj)=="methylDiffDB" | class(obj)=="methylRawDB"){
+      
+      .run.fastseg.tabix = function(gr0, ...){
+        
+        # adjust colnames of input GRanges to 
+        # methylKit naming convention
+        df2getcolnames = as.data.frame(gr0[1])
+        df2getcolnames$width = NULL 
+        methylKit:::.setMethylDBNames(df2getcolnames)
+        names(mcols(gr0)) = colnames(df2getcolnames)[5:ncol(df2getcolnames)]
+        
+        if("coverage" %in% names(mcols(gr0))){
+          gr0$meth = 100*gr0$numCs/gr0$coverage
+          gr0 = gr0[,"meth"]
+        }else if("meth.diff" %in% names(mcols(gr0))){
+          gr0 = gr0[,"meth.diff"]
+        }else if (class(obj) != "GRanges"){
+          stop("only methylRaw or methylDiff objects ", 
+               "or GRanges objects can be used in this function")
         }
-      message(paste("initializing clustering with",
-                    length(args.Mclust[["initialization"]][["subset"]]),
-                    "segments."))
-      initialize.on.subset = 1
+        # destrand
+        strand(gr0) <- "*"
+        
+        .run.fastseg(gr0, ...)
+      }
+      
+      seg.res <- applyTbxByChr(obj@dbpath,
+                               return.type = "GRanges",
+                               FUN = .run.fastseg.tabix,
+                               ...,
+                               mc.cores = cores)
     }
   }
   
-  if(initialize.on.subset != 1 && initialize.on.subset > 0 ) {
-    
-    if( initialize.on.subset > 0 & initialize.on.subset < 1 )
-      nbr.sample = floor(length(seg.res) * initialize.on.subset)
-    
-    if( initialize.on.subset > 1) 
-      nbr.sample = initialize.on.subset
-    
-    if( nbr.sample < 9 ){stop("too few samples, increase the size of subset.") }
-    
-    message(paste("initializing clustering with",nbr.sample,"out of",length(seg.res),"total segments."))
-    # estimate parameters for mclust
-    sub <- sample(1:length(seg.res), nbr.sample,replace = FALSE)
-    args.Mclust[["initialization"]]=list(subset = sub)
-  }  
-  
+  # 2. Density Estimation via Model-Based Clustering
   
-  # decide on number of components/groups
-  args.Mclust[["score.gr"]]=seg.res
-  args.Mclust[["diagnostic.plot"]]=diagnostic.plot
-  dens=do.call("densityFind", args.Mclust  )
-  
-  
-  # add components/group ids 
-  mcols(seg.res)$seg.group=as.character(dens$classification)
-  
-  # if joining, show clustering after joining
-  if(join.neighbours) {
-    message("joining neighbouring segments and repeating clustering.")
-    seg.res <- joinSegmentNeighbours(seg.res)
-    diagnostic.plot <- diagnostic.plot.old
-    
-    # get the new density
-    args.Mclust[["score.gr"]]=seg.res
-    args.Mclust[["diagnostic.plot"]]=diagnostic.plot
-    # skip second progress bar
-    args.Mclust[["verbose"]]=FALSE
-    dens=do.call("densityFind", args.Mclust  )
-    
+  if( length(seg.res) > 1 ){
+    seg.res = .run.mclust(seg.res,
+                          diagnostic.plot=diagnostic.plot, 
+                          join.neighbours=join.neighbours,
+                          initialize.on.subset=initialize.on.subset,
+                          cores=cores, ...)
   }
-  
-  seg.res
+  return(seg.res)
 }
 
+
 # not needed
 .methSeg<-function(score.gr,score.cols=NULL,...){
   #require(fastseg)
diff --git a/R/tabix.functions.R b/R/tabix.functions.R
index d8d2400f..73d37484 100644
--- a/R/tabix.functions.R
+++ b/R/tabix.functions.R
@@ -484,7 +484,7 @@ applyTbxByChunk<-function(tbxFile,chunk.size=1e6,dir,filename,
 }
 
 
-# applyTbxByCHr
+# applyTbxByChr
 #' Apply a function on tabix files chromosome by chromosome 
 #' 
 #' The function reads a tabix file chromosome by chromosome and applies 
@@ -511,7 +511,8 @@ applyTbxByChunk<-function(tbxFile,chunk.size=1e6,dir,filename,
 #' @return either a path to a tabix or text file, or a data frame or data.table
 #' @noRd
 applyTbxByChr<-function(tbxFile,chrs,dir,filename,
-                        return.type=c("tabix","data.frame","data.table"),
+                        return.type=c("tabix","data.frame","data.table",
+                                      "GRanges"),
                         FUN,...,mc.cores=1){
   
   return.type <- match.arg(return.type)
@@ -519,7 +520,7 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename,
   if(Sys.info()['sysname']=="Windows") {mc.cores = 1}
   if(missing(chrs)) { chrs = Rsamtools::seqnamesTabix(tbxFile)}
   if(return.type =="tabix"){
-   
+    
     # create a custom function that contains the function
     # to be applied
     myFunc<-function(chr,tbxFile,dir,filename,FUN,...){
@@ -539,11 +540,11 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename,
     res=mclapply(chrs,myFunc,tbxFile,dir,filename2,FUN,...,mc.cores = mc.cores)
     
     # collect & cat temp files,then make tabix
-
+    
     path <- catsub2tabix(dir,filename2,filename)
     
     return(gsub(".tbi","",path))
-
+    
     
   }else if(return.type=="data.frame"){
     
@@ -558,7 +559,7 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename,
     
     # collect and return
     data.frame(rbindlist(res))
-  }else{
+  }else if(return.type=="data.table"){
     
     myFunc3<-function(chr,tbxFile,FUN,...){
       data=getTabixByChr(chr = chr,tbxFile,return.type="data.table")
@@ -569,6 +570,17 @@ applyTbxByChr<-function(tbxFile,chrs,dir,filename,
     
     # collect and return
     rbindlist(res)
+  }else{
+    myFunc4<-function(chr,tbxFile,FUN,...){
+      data=getTabixByChr(chr = chr,tbxFile,return.type="GRanges") 
+      res=FUN(data,...)  
+    }
+    
+    res=mclapply(chrs,myFunc4,tbxFile,FUN,...,mc.cores = mc.cores)
+    
+    # collect and return
+    # suppressWarnings -> combined objs dont have chrs in common
+    suppressWarnings( do.call("c", res) )
   }
 }
 
diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r
index 36705501..eba810f6 100644
--- a/tests/testthat/test-8-methSeg.r
+++ b/tests/testthat/test-8-methSeg.r
@@ -25,6 +25,27 @@ test_that("check if methSeg returns error for methylBase objects" ,{
   expect_error(methSeg(methylBase.obj,diagnostic.plot = FALSE))
 })
 
+test_that("check if methSeg with cores > 1 is the same as cores=1" ,{
+  a=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE)
+  b=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE, cores=2)
+  expect_equal(a,b)
+})
+
+test_that("check if methSeg with cores > 1 is the same as cores=1 (non-tabix file)" ,{
+  a=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE)
+  b=methSeg(methylRawList.obj[[3]],diagnostic.plot = FALSE, cores=2)
+  expect_equal(a,b)
+})
+
+methylRawDB.obj <- methRead(
+  system.file("extdata", "control1.myCpG.txt", package = "methylKit"),
+  sample.id = "ctrl1", assembly = "hg18")
+
+test_that("check if methSeg with cores > 1 is the same as cores=1 (tabix file)" ,{
+  a=methSeg(methylRawDB.obj,diagnostic.plot = FALSE)
+  b=methSeg(methylRawDB.obj,diagnostic.plot = FALSE, cores=2)
+  expect_equal(a,b)
+})
 
 gr = as(methylRawList.obj[[1]],"GRanges")
 mcols(gr)$meth=100*gr$numCs/gr$coverage

From 6e9dd280dad60669096aeaa1220383c41fd6b0f7 Mon Sep 17 00:00:00 2001
From: katwre <katarzyna.wreczycka@mdc-berlin.de>
Date: Thu, 28 Jun 2018 18:28:30 +0200
Subject: [PATCH 09/13] modified man/methSeg.Rd

---
 man/methSeg.Rd | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/man/methSeg.Rd b/man/methSeg.Rd
index 83b615a9..a734737f 100644
--- a/man/methSeg.Rd
+++ b/man/methSeg.Rd
@@ -6,7 +6,7 @@
 \title{Segment methylation or differential methylation profile}
 \usage{
 methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE,
-  initialize.on.subset = 1, ...)
+  initialize.on.subset = 1, cores = 1, ...)
 }
 \arguments{
 \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, 
@@ -34,6 +34,9 @@ integer higher than 1 to define the number of regions to sample.
 By default uses the whole dataset, which can be time consuming on 
 large datasets. (Default: 1)}
 
+\item{cores}{number of cores. The function is parallelized per chromosome
+(default: 1).}
+
 \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 package, or to \code{\link[mclust]{densityMclust}}
 in Mclust package, could be used to fine tune the segmentation algorithm.
@@ -74,7 +77,7 @@ After initial segmentation with fastseg(), segments are clustered
 \examples{
 
 \donttest{
- download.file("https://github.com/BIMSBbioinfo/compgen2018/raw/master/day3_diffMeth/data/H1.chr21.chr22.rds",
+ download.file("https://dl.dropboxusercontent.com/u/1373164/H1.chr21.chr22.rds",
                destfile="H1.chr21.chr22.rds",method="curl")
 
  mbw=readRDS("H1.chr21.chr22.rds")

From e628ede1586a1907cf4866764e860a3c0f5c2241 Mon Sep 17 00:00:00 2001
From: katwre <katarzyna.wreczycka@mdc-berlin.de>
Date: Fri, 29 Jun 2018 15:22:11 +0200
Subject: [PATCH 10/13] mc.cores, tests methylRawDB and other improvem.

---
 R/methSeg.R                     | 30 +++++++++++-------------------
 man/methSeg.Rd                  |  4 ++--
 tests/testthat/test-8-methSeg.r |  4 ++--
 3 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index 99702204..ebacbe58 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -31,7 +31,6 @@
   if(length(seg.res)==1) {
     warning("segmentation produced only one range, no mixture modeling possible.")
     seg.res$seg.group <- "1"
-    return(seg.res)
   }
   return(seg.res)
 }
@@ -135,7 +134,7 @@
 #'        integer higher than 1 to define the number of regions to sample. 
 #'        By default uses the whole dataset, which can be time consuming on 
 #'        large datasets. (Default: 1)
-#' @param cores number of cores. The function is parallelized per chromosome
+#' @param mc.cores number of cores. The function is parallelized per chromosome
 #'        (default: 1).
 #' @param ... arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
 #'        package, or to \code{\link[mclust]{densityMclust}}
@@ -196,11 +195,11 @@
 #' @rdname methSeg       
 methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
                   initialize.on.subset=1,
-                  cores=1, ...){
+                  mc.cores=1, ...){
   
   # 1. Run segmentation
   
-  if(cores==1){
+  if(mc.cores==1){
     
     ##coerce object to granges
     if(class(obj)=="methylRaw" | class(obj)=="methylRawDB") {
@@ -225,7 +224,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
     seg.res = .run.fastseg(obj)
   }
   
-  if(cores>1){
+  if(mc.cores>1){
     
     ## Non-tabix files  
     if (class(obj)=="methylDiff" | class(obj)=="methylRaw" | 
@@ -254,7 +253,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
       # run methSeg on chromosomes (Could be in parallel or not)
       seg.res.list <- mclapply(chrs, function(chr){
         seg.res = .run.fastseg(obj[seqnames(obj)==chr])
-      }, mc.cores=cores)
+      }, mc.cores=mc.cores)
       
       # Merge resulting segments again to whole genome.
       # suppressWarnings -> combined objs dont have chrs in common
@@ -263,24 +262,16 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
       ## Tabix files
     } else if(class(obj)=="methylDiffDB" | class(obj)=="methylRawDB"){
       
-      .run.fastseg.tabix = function(gr0, ...){
+      .run.fastseg.tabix = function(gr0, class0, ...){
         
         # adjust colnames of input GRanges to 
         # methylKit naming convention
         df2getcolnames = as.data.frame(gr0[1])
         df2getcolnames$width = NULL 
-        methylKit:::.setMethylDBNames(df2getcolnames)
+        print(class0)
+        .setMethylDBNames(df2getcolnames, class0)
         names(mcols(gr0)) = colnames(df2getcolnames)[5:ncol(df2getcolnames)]
         
-        if("coverage" %in% names(mcols(gr0))){
-          gr0$meth = 100*gr0$numCs/gr0$coverage
-          gr0 = gr0[,"meth"]
-        }else if("meth.diff" %in% names(mcols(gr0))){
-          gr0 = gr0[,"meth.diff"]
-        }else if (class(obj) != "GRanges"){
-          stop("only methylRaw or methylDiff objects ", 
-               "or GRanges objects can be used in this function")
-        }
         # destrand
         strand(gr0) <- "*"
         
@@ -290,8 +281,9 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
       seg.res <- applyTbxByChr(obj@dbpath,
                                return.type = "GRanges",
                                FUN = .run.fastseg.tabix,
+                               class = class(obj),
                                ...,
-                               mc.cores = cores)
+                               mc.cores = mc.cores)
     }
   }
   
@@ -302,7 +294,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
                           diagnostic.plot=diagnostic.plot, 
                           join.neighbours=join.neighbours,
                           initialize.on.subset=initialize.on.subset,
-                          cores=cores, ...)
+                          ...)
   }
   return(seg.res)
 }
diff --git a/man/methSeg.Rd b/man/methSeg.Rd
index a734737f..e798ab85 100644
--- a/man/methSeg.Rd
+++ b/man/methSeg.Rd
@@ -6,7 +6,7 @@
 \title{Segment methylation or differential methylation profile}
 \usage{
 methSeg(obj, diagnostic.plot = TRUE, join.neighbours = FALSE,
-  initialize.on.subset = 1, cores = 1, ...)
+  initialize.on.subset = 1, mc.cores = 1, ...)
 }
 \arguments{
 \item{obj}{\code{\link[GenomicRanges]{GRanges}}, \code{\link{methylDiff}}, 
@@ -34,7 +34,7 @@ integer higher than 1 to define the number of regions to sample.
 By default uses the whole dataset, which can be time consuming on 
 large datasets. (Default: 1)}
 
-\item{cores}{number of cores. The function is parallelized per chromosome
+\item{mc.cores}{number of cores. The function is parallelized per chromosome
 (default: 1).}
 
 \item{...}{arguments to \code{\link[fastseg]{fastseg}} function in fastseg 
diff --git a/tests/testthat/test-8-methSeg.r b/tests/testthat/test-8-methSeg.r
index eba810f6..1af1b7b2 100644
--- a/tests/testthat/test-8-methSeg.r
+++ b/tests/testthat/test-8-methSeg.r
@@ -37,9 +37,9 @@ test_that("check if methSeg with cores > 1 is the same as cores=1 (non-tabix fil
   expect_equal(a,b)
 })
 
-methylRawDB.obj <- methRead(
+methylRawDB.obj <- suppressMessages( methRead(
   system.file("extdata", "control1.myCpG.txt", package = "methylKit"),
-  sample.id = "ctrl1", assembly = "hg18")
+  sample.id = "ctrl1", assembly = "hg18", dbtype="tabix") )
 
 test_that("check if methSeg with cores > 1 is the same as cores=1 (tabix file)" ,{
   a=methSeg(methylRawDB.obj,diagnostic.plot = FALSE)

From aff6561b0f1a2f565526b7098f5dcfb68e3bba6c Mon Sep 17 00:00:00 2001
From: katwre <katarzyna.wreczycka@mdc-berlin.de>
Date: Mon, 2 Jul 2018 12:13:14 +0200
Subject: [PATCH 11/13] Add more comments

---
 R/methSeg.R | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index ebacbe58..1f6101c8 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -1,5 +1,8 @@
 # segmentation functions 
 
+#' # Segment methylation calls. 
+#' Run fastseg GRanges object `obj`
+#' with additional parameters in `...`.
 .run.fastseg = function(obj, ...){
   
   dots <- list(...)
@@ -35,7 +38,12 @@
   return(seg.res)
 }
 
-
+#' # Use mixture modeling for the density function estimated 
+#' # and BIC criterion used to decide the optimum number of components
+#' # in mixture modeling.
+#' Run mclust::densityMclust using GRanges object `seg.res`, look for description
+#' of `join.neighbours`, `initialize.on.subset` and other parameters
+#' in methylKit methSeg.
 .run.mclust = function(seg.res, diagnostic.plot=TRUE, join.neighbours=FALSE,
                        initialize.on.subset=1, ...){
   
@@ -197,7 +205,7 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
                   initialize.on.subset=1,
                   mc.cores=1, ...){
   
-  # 1. Run segmentation
+  # 1. Run segmentation using fastseg
   
   if(mc.cores==1){
     
@@ -252,6 +260,8 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
       # Split the input object based on chromosome
       # run methSeg on chromosomes (Could be in parallel or not)
       seg.res.list <- mclapply(chrs, function(chr){
+        
+        # Run segmentation
         seg.res = .run.fastseg(obj[seqnames(obj)==chr])
       }, mc.cores=mc.cores)
       
@@ -264,8 +274,8 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
       
       .run.fastseg.tabix = function(gr0, class0, ...){
         
-        # adjust colnames of input GRanges to 
-        # methylKit naming convention
+        # rename names of meta columns of input GRanges `gr0` to 
+        # methylKit naming convention (such as e.g. coverage, numCs, numTs)
         df2getcolnames = as.data.frame(gr0[1])
         df2getcolnames$width = NULL 
         print(class0)
@@ -275,9 +285,11 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
         # destrand
         strand(gr0) <- "*"
         
+        # Run segmentation
         .run.fastseg(gr0, ...)
       }
       
+      # Run segmentation for each chromosome
       seg.res <- applyTbxByChr(obj@dbpath,
                                return.type = "GRanges",
                                FUN = .run.fastseg.tabix,
@@ -287,7 +299,8 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
     }
   }
   
-  # 2. Density Estimation via Model-Based Clustering
+  # 2. Density Estimation via Model-Based Clustering using
+  #     mclust::densityMclust function
   
   if( length(seg.res) > 1 ){
     seg.res = .run.mclust(seg.res,

From f8020c0bb942e432014c1b00b2df8fcd43337374 Mon Sep 17 00:00:00 2001
From: katwre <katarzyna.wreczycka@mdc-berlin.de>
Date: Tue, 3 Jul 2018 10:12:27 +0200
Subject: [PATCH 12/13] don't subsample segments after joining neighbouring
 segments

---
 R/methSeg.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/R/methSeg.R b/R/methSeg.R
index 1f6101c8..d8eb92bb 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -58,6 +58,7 @@
     diagnostic.plot = FALSE
   }
   
+  # Run subsampling of segments before performing the mixture modeling
   if("initialization" %in% names(args.Mclust)){
     if("subset" %in% names(args.Mclust[["initialization"]])) {
       if(length(args.Mclust[["initialization"]][["subset"]]) < 9 ){
@@ -105,6 +106,9 @@
     args.Mclust[["diagnostic.plot"]]=diagnostic.plot
     # skip second progress bar
     args.Mclust[["verbose"]]=FALSE
+    # do not run subsampling of segments
+    args.Mclust[["initialization"]]<-NULL
+    
     dens=do.call("densityFind", args.Mclust  )
     
   }

From f747c677ffc83e2ad4e41dd9157bca4e78fe2b20 Mon Sep 17 00:00:00 2001
From: katwre <katarzyna.wreczycka@mdc-berlin.de>
Date: Sun, 15 Jul 2018 21:42:54 +0200
Subject: [PATCH 13/13] keep only seqlevels that are non-empty

---
 R/methSeg.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/methSeg.R b/R/methSeg.R
index d8eb92bb..687ad47b 100644
--- a/R/methSeg.R
+++ b/R/methSeg.R
@@ -259,7 +259,9 @@ methSeg<-function(obj, diagnostic.plot=TRUE, join.neighbours=FALSE,
       }
       # destrand
       strand(obj) <- "*"
-      
+
+      # keep only seqlevels that are non-empty
+      seqlevels(obj) <- seqlevelsInUse(obj)
       chrs = seqlevels(obj)
       # Split the input object based on chromosome
       # run methSeg on chromosomes (Could be in parallel or not)