From 41ad5c3c6240507c0f54781dc47b571b3f295c36 Mon Sep 17 00:00:00 2001 From: Brian White Date: Tue, 19 Jun 2018 15:24:00 -0700 Subject: [PATCH] Generalized/fixed parsing of SRA ids from GEO metadata file; added output-file option; added more debugging info --- bin/get-geo-annotations.R | 49 ++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/bin/get-geo-annotations.R b/bin/get-geo-annotations.R index c0bd936..cffb306 100755 --- a/bin/get-geo-annotations.R +++ b/bin/get-geo-annotations.R @@ -15,7 +15,10 @@ suppressPackageStartupMessages(usePackage("optparse")) option_list <- list( make_option(c("--gse"), action="store", default=NULL, - help="GEO Series accession number (e.g., \"GSE89777\")")) + help="GEO Series accession number (e.g., \"GSE89777\")"), + make_option(c("--output-file"), action="store", + default="metadata.tsv", + help="File to store the metadata in TSV format [default: %default%")) descr <- "\ Extract GEO annotations from the GEO data set with identifier specified by 'GSE'. If the data set has SRA entries, the ftp directory is expected to be specified in GEO annotation fields that include the pattern 'supplementary_file'. This field will be parsed to extract the SRA identifier, which will be used to determine the URL of the FTP file. Whether an SRA data set or not, the URL will be included in the 'url' column of the output. @@ -28,6 +31,7 @@ opt <- arguments$options ## Collect input parameters gse.identifier <- opt$gse +output.file <- opt$`output-file` if ( length(arguments$args) != 0 ) { print_help(parser) @@ -75,28 +79,55 @@ if(length(supp.file.columns) > 1) { metadata.tbl$url <- as.character(metadata.tbl[, supp.file.columns[1]]) +cat("Outputting metadata file before resolving URLs to FTP sites\n") +write.table(file = output.file, metadata.tbl, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) + if(any(metadata.tbl$type == "SRA")) { sra.db.dest.file <- "SRAmetadb.sqlite" if(!file.exists(sra.db.dest.file)) { - sra.db.dest.file <- getSRAdbFile(destfile = paste0(sra.db.dest.file, ".gz")) + url <- "https://starbuck1.s3.amazonaws.com/sradb/SRAmetadb.sqlite.gz" + gzip.file <- paste0(sra.db.dest.file, ".gz") + download.file(url = url, destfile = gzip.file, mode = "wb", method = "libcurl") + system(paste0("gunzip ", gzip.file)) + ## sra.db.dest.file <- getSRAdbFile(destfile = paste0(sra.db.dest.file, ".gz")) } con <- dbConnect(RSQLite::SQLite(), sra.db.dest.file) + + if(!("relation" %in% colnames(metadata.tbl))) { + stop("Was expecting to parse SRA identifier from relation column, but no such column exists\n") + } for(i in 1:nrow(metadata.tbl)) { if(metadata.tbl$type[i] == "SRA") { - ## Extract the SRA identifier - ## NB: this is probably also in the relation column - url <- metadata.tbl$url[i] - sra.identifier <- gsub("^.*\\/([^\\/]+)$", "\\1", url) - url <- listSRAfile(sra.identifier, con)$ftp - metadata.tbl$url[i] <- url + ## Extract the SRA identifier + ## NB: this is probably also in the relation column + ## We once tried to extract this from the URL, but it was too + ## simplistic. + ## url <- metadata.tbl$url[i] + ## sra.identifier <- gsub("^.*\\/([^\\/]+)$", "\\1", url) + ## Instead, extract from the relation field + relation.str <- as.character(metadata.tbl$relation[i]) + relations <- unlist(strsplit(x = relation.str, split=";[ ]*")) + flag <- grepl(relations, pattern="SRA") + if(!(length(which(flag) == 1))) { + stop(paste0("Could not find SRA in relation string: ", relation.str, "\n")) + } + sra.relation <- relations[flag] + if(!grepl(sra.relation, pattern = "term=")) { + stop(paste0("Could not parse term= in relation string: ", sra.relation, "\n")) + } + sra.identifier <- gsub("^.*term=([^\\/]+)$", "\\1", sra.relation) + cat(paste0("Parsed SRA identifier ", sra.identifier, " from ", relation.str, "\n")) + url <- listSRAfile(sra.identifier, con)$ftp + metadata.tbl$url[i] <- url } } d <- dbDisconnect(con) } -write.table(metadata.tbl, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) +cat("Outputting metadata file after resolving URLs to FTP sites\n") +write.table(file = output.file, metadata.tbl, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) message("Successfully completed\n") q(status = 0)