From 10fb2e508f567e246ed7fef1a55b3ff45a2332aa Mon Sep 17 00:00:00 2001
From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com>
Date: Thu, 26 Mar 2020 13:23:39 -0400
Subject: [PATCH 1/6] Issue warning for possible duplicate analysis files

Previous runs of the pipeline may leave old analysis files in place.
This can cause duplicate tracks to be constructed.
---
 dnase/trackhub/samplesforTrackhub.R | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R
index 9d29236c..16fa3d7e 100755
--- a/dnase/trackhub/samplesforTrackhub.R
+++ b/dnase/trackhub/samplesforTrackhub.R
@@ -189,6 +189,18 @@ for(curdir in mappeddirs) {
 		next # Nothing follows here except for the long 'for(analysisFile...' loop.
 	}
 	
+    # Check for duplicate analysisFiles, possibly left over from previous runs.
+	AnalysisStrings <- strsplit(analysisFiles,".",fixed=TRUE)
+    # This gets rid of the trailing ".o########"
+	PossibleDups <- lapply(AnalysisStrings, function(AnalysisString) { paste(AnalysisString[2], AnalysisString[3], sep=".") })
+	FoundDups <- analysisFiles[duplicated(PossibleDups)]
+	if(length(FoundDups) > 0) {
+		message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
+		for(FoundDup in FoundDups) {
+			message("[samplesforTrackhub] ", "WARNING See possible dup file: ", FoundDup)
+		}
+	}
+	
 	#make_tracks.bash calls this R script only once when args.project equals "byFC", and only once when it equals "CEGS_byLocus". Each of the directories traversed may contain a unique sampleannotation.txt file so we need to look for it here rather than in make_tracks.bash
 	if(opt$project %in% c("byFC", "CEGS_byLocus") & is.null(opt$inputfile)) {
 		inputfile <- paste0(pwd, '/', dirname(curdir), "/sampleannotation.txt")

From 553bf1493b80996d4c6a9b0ed8fffd1f5f48b757 Mon Sep 17 00:00:00 2001
From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com>
Date: Thu, 26 Mar 2020 18:11:37 -0400
Subject: [PATCH 2/6] Check for dups in the "data" matrix, rather than in
 filenames.

---
 dnase/trackhub/samplesforTrackhub.R | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R
index 16fa3d7e..9e538f9d 100755
--- a/dnase/trackhub/samplesforTrackhub.R
+++ b/dnase/trackhub/samplesforTrackhub.R
@@ -171,6 +171,7 @@ if(opt$project == "CEGS_byLocus") {
     outputCols <- c(outputCols, "Study", "Project", "Assembly", "Type")
 }
 
+data_keys <- vector()
 data <- data.frame(matrix(ncol=length(outputCols), nrow=1))
 colnames(data) <- outputCols
 i <- 0 # This will be our "data" output variable index.
@@ -189,18 +190,6 @@ for(curdir in mappeddirs) {
 		next # Nothing follows here except for the long 'for(analysisFile...' loop.
 	}
 	
-    # Check for duplicate analysisFiles, possibly left over from previous runs.
-	AnalysisStrings <- strsplit(analysisFiles,".",fixed=TRUE)
-    # This gets rid of the trailing ".o########"
-	PossibleDups <- lapply(AnalysisStrings, function(AnalysisString) { paste(AnalysisString[2], AnalysisString[3], sep=".") })
-	FoundDups <- analysisFiles[duplicated(PossibleDups)]
-	if(length(FoundDups) > 0) {
-		message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
-		for(FoundDup in FoundDups) {
-			message("[samplesforTrackhub] ", "WARNING See possible dup file: ", FoundDup)
-		}
-	}
-	
 	#make_tracks.bash calls this R script only once when args.project equals "byFC", and only once when it equals "CEGS_byLocus". Each of the directories traversed may contain a unique sampleannotation.txt file so we need to look for it here rather than in make_tracks.bash
 	if(opt$project %in% c("byFC", "CEGS_byLocus") & is.null(opt$inputfile)) {
 		inputfile <- paste0(pwd, '/', dirname(curdir), "/sampleannotation.txt")
@@ -210,7 +199,7 @@ for(curdir in mappeddirs) {
 			inputSampleIDs <- NULL
 		}
 	}
-	
+    
 	for(analysisFile in analysisFiles) {
 		analysisFileContents <- readLines(paste0(pwd, '/', curdir, '/', analysisFile), n=2000)
 		
@@ -433,6 +422,19 @@ for(curdir in mappeddirs) {
 		}
 		
 		data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse="."))
+
+    	# Check for duplicate analysisFiles, possibly left over from a previous run.
+        # Note:  "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir.
+		fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
+		data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="")
+		if(i != 1) {
+			if(data_key %in% data_keys){
+				message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
+				msg <- paste("See:    SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "fcID:", fcID, sep=" ")
+				message("[samplesforTrackhub] ", msg)
+			}
+		}
+		data_keys[i] <- data_key
 	}
 }
 

From b2a9162eb0a4ad267bfa45a49ccff188db2ce1d3 Mon Sep 17 00:00:00 2001
From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com>
Date: Thu, 26 Mar 2020 18:18:23 -0400
Subject: [PATCH 3/6] Fixed some tabs

---
 dnase/trackhub/samplesforTrackhub.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R
index 9e538f9d..53d87d81 100755
--- a/dnase/trackhub/samplesforTrackhub.R
+++ b/dnase/trackhub/samplesforTrackhub.R
@@ -199,7 +199,7 @@ for(curdir in mappeddirs) {
 			inputSampleIDs <- NULL
 		}
 	}
-    
+	
 	for(analysisFile in analysisFiles) {
 		analysisFileContents <- readLines(paste0(pwd, '/', curdir, '/', analysisFile), n=2000)
 		
@@ -422,9 +422,9 @@ for(curdir in mappeddirs) {
 		}
 		
 		data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse="."))
-
-    	# Check for duplicate analysisFiles, possibly left over from a previous run.
-        # Note:  "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir.
+		
+		# Check for duplicate analysisFiles, possibly left over from a previous run.
+		# Note:  "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir.
 		fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
 		data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="")
 		if(i != 1) {

From 3bf3344ef1fe6f8118b40093e0a9b2ba80c1eb8e Mon Sep 17 00:00:00 2001
From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com>
Date: Mon, 30 Mar 2020 14:52:34 -0400
Subject: [PATCH 4/6] Changed if from != to >

---
 dnase/trackhub/samplesforTrackhub.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R
index 53d87d81..08c7c00f 100755
--- a/dnase/trackhub/samplesforTrackhub.R
+++ b/dnase/trackhub/samplesforTrackhub.R
@@ -427,7 +427,7 @@ for(curdir in mappeddirs) {
 		# Note:  "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir.
 		fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
 		data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="")
-		if(i != 1) {
+		if(i > 1) {
 			if(data_key %in% data_keys){
 				message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
 				msg <- paste("See:    SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "fcID:", fcID, sep=" ")

From c2fa72b0ff1e5c7de109634633cc4d56583e5e24 Mon Sep 17 00:00:00 2001
From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com>
Date: Tue, 31 Mar 2020 17:37:48 -0400
Subject: [PATCH 5/6] Added flowcell ID to Group when processing tracks
 byLocus.

However, I think this causes the byLocus browser view to no longer work as intended.

Take a look at byLocus in dev, and we can decide what to do.
---
 dnase/trackhub/samplesforTrackhub.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R
index 08c7c00f..a4eb28ab 100755
--- a/dnase/trackhub/samplesforTrackhub.R
+++ b/dnase/trackhub/samplesforTrackhub.R
@@ -327,7 +327,7 @@ for(curdir in mappeddirs) {
 					data$Group[i] <- paste0(flowcell_dates[[curFC]] , "_" , data$Group[i])
 				}
 			} else if(opt$project=="CEGS_byLocus") {
-				#Group values will be in the form of Study ID
+				#Group values will be in the form of [Study ID]_[FC ID]
 				
 				if(is.na(data$Genetic_Modification[i])) {
 					#Based on sample name
@@ -368,7 +368,8 @@ for(curdir in mappeddirs) {
 						}
 					}
 					data$Type[i] <- CEGSsampleType
-					data$Group[i] <- data$Study[i]
+					fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
+					data$Group[i] <- paste0(data$Study[i], "_", fcID)
 				}
 			} else {
 				stop("ERROR Impossible!")
@@ -424,13 +425,12 @@ for(curdir in mappeddirs) {
 		data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse="."))
 		
 		# Check for duplicate analysisFiles, possibly left over from a previous run.
-		# Note:  "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir.
-		fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
-		data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], fcID, sep="")
-		if(i > 1) {
+		# Note: When project==CEGS_byLocus, items with Group=NA are deleted near the end of this script.
+		data_key <- paste0(data$SampleID[i], data$Mapped_Genome[i], data$Group[i] )
+		if( (!is.na(data$Group[i])) && (i > 1) ) {
 			if(data_key %in% data_keys){
 				message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
-				msg <- paste("See:    SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "fcID:", fcID, sep=" ")
+				msg <- paste0("See:    SampleID: ", data$SampleID[i], " Mapped_Genome: ", data$Mapped_Genome[i], " Group: ", data$Group[i])
 				message("[samplesforTrackhub] ", msg)
 			}
 		}

From 3fb0e3fdca486b72f90a6bcd6eed2e258eda4fb6 Mon Sep 17 00:00:00 2001
From: John Cadley <45205158+cadley-nyulangone@users.noreply.github.com>
Date: Mon, 6 Apr 2020 18:00:30 -0400
Subject: [PATCH 6/6] Add a FlowCellID column to samplesforTrackhub.R output

The FlowCellID column is used to make unique tracknames for ByLocus tracks.
---
 dnase/trackhub/MakeTrackhub.py      | 11 +++++++----
 dnase/trackhub/samplesforTrackhub.R | 21 +++++++++++----------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/dnase/trackhub/MakeTrackhub.py b/dnase/trackhub/MakeTrackhub.py
index 6af24685..734c5f3b 100755
--- a/dnase/trackhub/MakeTrackhub.py
+++ b/dnase/trackhub/MakeTrackhub.py
@@ -312,9 +312,12 @@ def shortest_unique_strings(array, minlength=1):
             # So sampleName_trackname is increased by 4+8+9=21 characters.
             # If sampleName_trackname starts with 107 characters, then 128 characters get sent to the server.  This causes an error.
             # So sampleName_trackname needs to be 106 characters or less. 
-            sampleName_trackname = cleanTrackName(sampleNameGenome + "_" + curGroup + "_" + curSample['SampleID'])
-            
-            
+            if args.supertrack == "By_Locus":
+                # curGroup does not contain the flowcell ID here.
+                sampleName_trackname = cleanTrackName(sampleNameGenome + "_" + curGroup + "_" + curSample['FlowCellID'] + "_" + curSample['SampleID'])
+            else:
+                sampleName_trackname = cleanTrackName(sampleNameGenome + "_" + curGroup + "_" + curSample['SampleID'])
+
             # Make sure there are no duplicate track names.
             if sampleName_trackname in sampleName_dict:
                 if args.verbose:
@@ -411,7 +414,7 @@ def shortest_unique_strings(array, minlength=1):
                             visibility="full",
                             parentonoff=DensCovTracksDefaultDisplayMode,
                             tracktype="bigWig",
-                            viewLimits="0:500",
+                            viewLimits="0:500", #Keep high since it becomes a hard limit in the UI
                             autoScale='on',
                             alwaysZero='on',
                             maxHeightPixels="100:30:10",
diff --git a/dnase/trackhub/samplesforTrackhub.R b/dnase/trackhub/samplesforTrackhub.R
index a4eb28ab..50490296 100755
--- a/dnase/trackhub/samplesforTrackhub.R
+++ b/dnase/trackhub/samplesforTrackhub.R
@@ -166,7 +166,7 @@ colorAssignments <- NULL
 
 
 # Initialize "data" with just column names.  We'll be adding rows to this later on in the code.
-outputCols <- c("Name", "SampleID", "Assay", "Group", "filebase", "Mapped_Genome", "Annotation_Genome", "Color", "analyzed_reads", "Genomic_coverage", "SPOT", "Num_hotspots", "Exclude", "Age", "Institution", "Replicate", "Bait_set", "Genetic_Modification")
+outputCols <- c("Name", "SampleID", "Assay", "Group", "filebase", "Mapped_Genome", "Annotation_Genome", "Color", "analyzed_reads", "Genomic_coverage", "SPOT", "Num_hotspots", "Exclude", "Age", "Institution", "Replicate", "Bait_set", "Genetic_Modification", "FlowCellID")
 if(opt$project == "CEGS_byLocus") {
     outputCols <- c(outputCols, "Study", "Project", "Assembly", "Type")
 }
@@ -327,7 +327,7 @@ for(curdir in mappeddirs) {
 					data$Group[i] <- paste0(flowcell_dates[[curFC]] , "_" , data$Group[i])
 				}
 			} else if(opt$project=="CEGS_byLocus") {
-				#Group values will be in the form of [Study ID]_[FC ID]
+				#Group values will be in the form of Study ID
 				
 				if(is.na(data$Genetic_Modification[i])) {
 					#Based on sample name
@@ -368,8 +368,7 @@ for(curdir in mappeddirs) {
 						}
 					}
 					data$Type[i] <- CEGSsampleType
-					fcID <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
-					data$Group[i] <- paste0(data$Study[i], "_", fcID)
+					data$Group[i] <- data$Study[i]
 				}
 			} else {
 				stop("ERROR Impossible!")
@@ -425,13 +424,15 @@ for(curdir in mappeddirs) {
 		data$filebase[i] <- paste0(curdir, "/", paste0(unlist(strsplit(basename(analysisFile), "\\."))[2:3], collapse="."))
 		
 		# Check for duplicate analysisFiles, possibly left over from a previous run.
-		# Note: When project==CEGS_byLocus, items with Group=NA are deleted near the end of this script.
-		data_key <- paste0(data$SampleID[i], data$Mapped_Genome[i], data$Group[i] )
-		if( (!is.na(data$Group[i])) && (i > 1) ) {
+		# Note:  "data$Group" does not contain flowcell ID info when opt$project=CEGS_byLocus, so we need to get it from curdir.
+		#         FlowCellID will also be used in MakeTrackhub.py to create unique byLocus tracknames.
+		data$FlowCellID[i] <- strsplit(curdir, "/", fixed=TRUE)[[1]][1]
+		data_key <- paste(data$SampleID[i], data$Mapped_Genome[i], data$FlowCellID[i], sep="")
+		if(i > 1) {
 			if(data_key %in% data_keys){
-				message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
-				msg <- paste0("See:    SampleID: ", data$SampleID[i], " Mapped_Genome: ", data$Mapped_Genome[i], " Group: ", data$Group[i])
-				message("[samplesforTrackhub] ", msg)
+			message("[samplesforTrackhub] ", "WARNING Possible duplicate analysis files found in ", curdir)
+			msg <- paste("See:    SampleID:", data$SampleID[i], "Mapped_Genome:", data$Mapped_Genome[i], "FlowCellID:", data$FlowCellID[i], sep=" ")
+			message("[samplesforTrackhub] ", msg)
 			}
 		}
 		data_keys[i] <- data_key