From 10fb2e508f567e246ed7fef1a55b3ff45a2332aa Mon Sep 17 00:00:00 2001 From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com> Date: Thu, 26 Mar 2020 13:23:39 -0400 Subject: [PATCH 1/6] Issue warning for possible duplicate analysis files Previous runs of the pipeline may leave old analysis files in place. This can cause duplicate tracks to be constructed. --- dnase/trackhub/samplesforTrackhub.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R index 9d29236c..16fa3d7e 100755 --- a/dnase/trackhub/samplesforTrackhub.R +++ b/dnase/trackhub/samplesforTrackhub.R @@ -189,6 +189,18 @@ for(curdir in mappeddirs) { next # Nothing follows here except for the long 'for(analysisFile...' loop. } + # Check for duplicate analysisFiles, possibly left over from previous runs. + AnalysisStrings <- strsplit(analysisFiles,".",fixed=TRUE) + # This gets rid of the trailing ".o########" + PossibleDups <- lapply(AnalysisStrings, function(AnalysisString) { paste(AnalysisString[2], AnalysisString[3], sep=".") }) + FoundDups <- analysisFiles[duplicated(PossibleDups)] + if(length(FoundDups) > 0) { + message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) + for(FoundDup in FoundDups) { + message("[samplesforTrackhub] ", "WARNING See possible dup file: ", FoundDup) + } + } + #make_tracks.bash calls this R script only once when args.project equals "byFC", and only once when it equals "CEGS_byLocus". 
Each of the directories traversed may contain a unique sampleannotation.txt file so we need to look for it here rather than in make_tracks.bash if(opt$project %in% c("byFC", "CEGS_byLocus") & is.null(opt$inputfile)) { inputfile <- paste0(pwd, '/', dirname(curdir), "/sampleannotation.txt") From 553bf1493b80996d4c6a9b0ed8fffd1f5f48b757 Mon Sep 17 00:00:00 2001 From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com> Date: Thu, 26 Mar 2020 18:11:37 -0400 Subject: [PATCH 2/6] Check for dups in the "data" matrix, rather than in filenames. --- dnase/trackhub/samplesforTrackhub.R | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R index 16fa3d7e..9e538f9d 100755 --- a/dnase/trackhub/samplesforTrackhub.R +++ b/dnase/trackhub/samplesforTrackhub.R @@ -171,6 +171,7 @@ if(opt$project == "CEGS_byLocus") { outputCols <- c(outputCols, "Study", "Project", "Assembly", "Type") } +data_keys <- vector() data <- data.frame(matrix(ncol=length(outputCols), nrow=1)) colnames(data) <- outputCols i <- 0 # This will be our "data" output variable index. @@ -189,18 +190,6 @@ for(curdir in mappeddirs) { next # Nothing follows here except for the long 'for(analysisFile...' loop. } - # Check for duplicate analysisFiles, possibly left over from previous runs. 
- AnalysisStrings <- strsplit(analysisFiles,".",fixed=TRUE) - # This gets rid of the trailing ".o########" - PossibleDups <- lapply(AnalysisStrings, function(AnalysisString) { paste(AnalysisString[2], AnalysisString[3], sep=".") }) - FoundDups <- analysisFiles[duplicated(PossibleDups)] - if(length(FoundDups) > 0) { - message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) - for(FoundDup in FoundDups) { - message("[samplesforTrackhub] ", "WARNING See possible dup file: ", FoundDup) - } - } - #make_tracks.bash calls this R script only once when args.project equals "byFC", and only once when it equals "CEGS_byLocus". Each of the directories traversed may contain a unique sampleannotation.txt file so we need to look for it here rather than in make_tracks.bash if(opt$project %in% c("byFC", "CEGS_byLocus") & is.null(opt$inputfile)) { inputfile <- paste0(pwd, '/', dirname(curdir), "/sampleannotation.txt") @@ -210,7 +199,7 @@ for(curdir in mappeddirs) { inputSampleIDs <- NULL } } - + for(analysisFile in analysisFiles) { analysisFileContents <- readLines(paste0(pwd, '/', curdir, '/', analysisFile), n=2000) @@ -433,6 +422,19 @@ for(curdir in mappeddirs) { } data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse=".")) + + # Check for duplicate analysisFiles, possibly left over from a previous run. + # Note: "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir. 
+ fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] + data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="") + if(i != 1) { + if(data_key %in% data_keys){ + message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) + msg <- paste("See: SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "fcID:", fcID, sep=" ") + message("[samplesforTrackhub] ", msg) + } + } + data_keys[i] <- data_key } } From b2a9162eb0a4ad267bfa45a49ccff188db2ce1d3 Mon Sep 17 00:00:00 2001 From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com> Date: Thu, 26 Mar 2020 18:18:23 -0400 Subject: [PATCH 3/6] Fixed some tabs --- dnase/trackhub/samplesforTrackhub.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R index 9e538f9d..53d87d81 100755 --- a/dnase/trackhub/samplesforTrackhub.R +++ b/dnase/trackhub/samplesforTrackhub.R @@ -199,7 +199,7 @@ for(curdir in mappeddirs) { inputSampleIDs <- NULL } } - + for(analysisFile in analysisFiles) { analysisFileContents <- readLines(paste0(pwd, '/', curdir, '/', analysisFile), n=2000) @@ -422,9 +422,9 @@ for(curdir in mappeddirs) { } data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse=".")) - - # Check for duplicate analysisFiles, possibly left over from a previous run. - # Note: "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir. + + # Check for duplicate analysisFiles, possibly left over from a previous run. + # Note: "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir. 
fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="") if(i != 1) { From 3bf3344ef1fe6f8118b40093e0a9b2ba80c1eb8e Mon Sep 17 00:00:00 2001 From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com> Date: Mon, 30 Mar 2020 14:52:34 -0400 Subject: [PATCH 4/6] Changed if from != to > --- dnase/trackhub/samplesforTrackhub.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R index 53d87d81..08c7c00f 100755 --- a/dnase/trackhub/samplesforTrackhub.R +++ b/dnase/trackhub/samplesforTrackhub.R @@ -427,7 +427,7 @@ for(curdir in mappeddirs) { # Note: "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir. fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="") - if(i != 1) { + if(i > 1) { if(data_key %in% data_keys){ message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) msg <- paste("See: SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "fcID:", fcID, sep=" ") message("[samplesforTrackhub] ", msg) } } From c2fa72b0ff1e5c7de109634633cc4d56583e5e24 Mon Sep 17 00:00:00 2001 From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com> Date: Tue, 31 Mar 2020 17:37:48 -0400 Subject: [PATCH 5/6] Added flowcell ID to Group when processing tracks byLocus. However, I think this causes the byLocus browser view to no longer work as intended. Take a look at byLocus in dev, and we can decide what to do. 
--- dnase/trackhub/samplesforTrackhub.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R index 08c7c00f..a4eb28ab 100755 --- a/dnase/trackhub/samplesforTrackhub.R +++ b/dnase/trackhub/samplesforTrackhub.R @@ -327,7 +327,7 @@ for(curdir in mappeddirs) { data$Group[i] <- paste0(flowcell_dates[[curFC]] , "_" , data$Group[i]) } } else if(opt$project=="CEGS_byLocus") { - #Group values will be in the form of Study ID + #Group values will be in the form of [Study ID]_[FC ID] if(is.na(data$Genetic_Modification[i])) { #Based on sample name @@ -368,7 +368,8 @@ for(curdir in mappeddirs) { } } data$Type[i] <- CEGSsampleType - data$Group[i] <- data$Study[i] + fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] + data$Group[i] <- paste0(data$Study[i], "_", fcID) } } else { stop("ERROR Impossible!") @@ -424,13 +425,12 @@ for(curdir in mappeddirs) { data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse=".")) # Check for duplicate analysisFiles, possibly left over from a previous run. - # Note: "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir. - fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] - data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="") - if(i > 1) { + # Note: When project==CEGS_byLocus, items with Group=NA are deleted near the end of this script. 
+ data_key <- paste0(data$SampleID[i], data$Mapped_Genome[i], data$Group[i] ) + if( (!is.na(data$Group[i])) && (i > 1) ) { if(data_key %in% data_keys){ message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) - msg <- paste("See: SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "fcID:", fcID, sep=" ") + msg <- paste0("See: SampleID: ", data$SampleID[i], " Mapped_Genome: ", data$Mapped_Genome[i], " Group: ", data$Group[i]) message("[samplesforTrackhub] ", msg) } } From 3fb0e3fdca486b72f90a6bcd6eed2e258eda4fb6 Mon Sep 17 00:00:00 2001 From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com> Date: Mon, 6 Apr 2020 18:00:30 -0400 Subject: [PATCH 6/6] Add a FlowCellID column to samplesforTrackhub.R output Is used to make unique tracknames for ByLocus tracks. --- dnase/trackhub/MakeTrackhub.py | 11 +++++++---- dnase/trackhub/samplesforTrackhub.R | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/dnase/trackhub/MakeTrackhub.py b/dnase/trackhub/MakeTrackhub.py index 6af24685..734c5f3b 100755 --- a/dnase/trackhub/MakeTrackhub.py +++ b/dnase/trackhub/MakeTrackhub.py @@ -312,9 +312,12 @@ def shortest_unique_strings(array, minlength=1): # So sampleName_trackname is increased by 4+8+9=21 characters. # If sampleName_trackname starts with 107 characters, then 128 characters get sent to the server. This causes an error. # So sampleName_trackname needs to be 106 characters or less. - sampleName_trackname = cleanTrackName(sampleNameGenome + "_" + curGroup + "_" + curSample['SampleID']) - - + if args.supertrack == "By_Locus": + # curGroup does not contain the flowcell ID here. + sampleName_trackname = cleanTrackName(sampleNameGenome + "_" + curGroup + "_" + curSample['FlowCellID'] + "_" + curSample['SampleID']) + else: + sampleName_trackname = cleanTrackName(sampleNameGenome + "_" + curGroup + "_" + curSample['SampleID']) + # Make sure there are no duplicate track names. 
if sampleName_trackname in sampleName_dict: if args.verbose: @@ -411,7 +414,7 @@ def shortest_unique_strings(array, minlength=1): visibility="full", parentonoff=DensCovTracksDefaultDisplayMode, tracktype="bigWig", - viewLimits="0:500", + viewLimits="0:500", #Keep high since it becomes a hard limit in the UI autoScale='on', alwaysZero='on', maxHeightPixels="100:30:10", diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R index a4eb28ab..50490296 100755 --- a/dnase/trackhub/samplesforTrackhub.R +++ b/dnase/trackhub/samplesforTrackhub.R @@ -166,7 +166,7 @@ colorAssignments <- NULL # Initialize "data" with just column names. We'll be adding rows to this later on in the code. -outputCols <- c("Name", "SampleID", "Assay", "Group", "filebase", "Mapped_Genome", "Annotation_Genome", "Color", "analyzed_reads", "Genomic_coverage", "SPOT", "Num_hotspots", "Exclude", "Age", "Institution", "Replicate", "Bait_set", "Genetic_Modification") +outputCols <- c("Name", "SampleID", "Assay", "Group", "filebase", "Mapped_Genome", "Annotation_Genome", "Color", "analyzed_reads", "Genomic_coverage", "SPOT", "Num_hotspots", "Exclude", "Age", "Institution", "Replicate", "Bait_set", "Genetic_Modification", "FlowCellID") if(opt$project == "CEGS_byLocus") { outputCols <- c(outputCols, "Study", "Project", "Assembly", "Type") } @@ -327,7 +327,7 @@ for(curdir in mappeddirs) { data$Group[i] <- paste0(flowcell_dates[[curFC]] , "_" , data$Group[i]) } } else if(opt$project=="CEGS_byLocus") { - #Group values will be in the form of [Study ID]_[FC ID] + #Group values will be in the form of Study ID if(is.na(data$Genetic_Modification[i])) { #Based on sample name @@ -368,8 +368,7 @@ for(curdir in mappeddirs) { } } data$Type[i] <- CEGSsampleType - fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] - data$Group[i] <- paste0(data$Study[i], "_", fcID) + data$Group[i] <- data$Study[i] } } else { stop("ERROR Impossible!") @@ -425,13 +424,15 @@ for(curdir in mappeddirs) { 
data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse=".")) # Check for duplicate analysisFiles, possibly left over from a previous run. - # Note: When project==CEGS_byLocus, items with Group=NA are deleted near the end of this script. - data_key <- paste0(data$SampleID[i], data$Mapped_Genome[i], data$Group[i] ) - if( (!is.na(data$Group[i])) && (i > 1) ) { + # Note: "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir. + # FlowCellID will also be used in MakeTrackhub.py to create unique byLocus tracknames. + data$FlowCellID[i] <- strsplit(curdir, "/", fixed=TRUE)[[1]][1] + data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], data$FlowCellID[i], sep="") + if(i > 1) { if(data_key %in% data_keys){ - message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) - msg <- paste0("See: SampleID: ", data$SampleID[i], " Mapped_Genome: ", data$Mapped_Genome[i], " Group: ", data$Group[i]) - message("[samplesforTrackhub] ", msg) + message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir) + msg <- paste("See: SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "FlowCellID:", data$FlowCellID[i], sep=" ") + message("[samplesforTrackhub] ", msg) } } data_keys[i] <- data_key