utils/access_genbank.py at main · GenoRobotics-EPFL/utils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from Bio import SeqIO, Entrez
import pandas as pd
import os.path as ospath

Entrez.email = 'contact@genorobotics.org'

#-------------------------------
#extract information from ncbi
#-------------------------------

def build_search_term(gene_name, min_len=0, max_len=-1):
    """ generate a search query for NCBI's nucleotide database (GenBank). Bio.Entrez does not provide filters such as sequence length and gene name
    so a search query keywords must be used instead.
    Used to make database queries in extract_database and download_database

    Parameters
    ----------
        gene_name: (str) name of the gene of interest
        min_len: (int, optional) lower bound of the sequence length filter, by default 0
        max_len: (int, optional) upper bound of the sequence length filter, by default -1 means no limit

    Returns
    ----------
        term: (str) search query
    """
    term = f"{gene_name}[Gene Name]"
    if max_len == -1:
        term += f" NOT 0:{min_len}[Sequence Length]"
    else:
        term += f" AND {min_len}:{max_len}[Sequence Length]"
    return term


def extract_database(gene_name, seqlen_start=0, seqlen_stop=-1):
    '''Request data on GenBank from NCBI (with Entrez module), with filtering options

    Parameters
    ----------
        gene_name: (str) name of the gene to extract from GenBank
        seqlen_start: (int, optional) lower bound of the sequence length filter, by default 0
        seqlen_stop: (int, optional) upper bound of the sequence length filter, by default -1 means no limit

    Returns
    ----------
        seq_record: (str) raw data of all sequences
    '''
    handle = Entrez.esearch(db= "nucleotide", term= build_search_term(gene_name, seqlen_start, seqlen_stop), retmax = 10000)
    record = Entrez.read(handle)
    matching_requests = record["IdList"]
    request_counter = 0
    while len(record["IdList"]) == 10000:
        request_counter += 1
        handle = Entrez.esearch(db= "nucleotide", term= build_search_term(gene_name, seqlen_start, seqlen_stop)
                                , retstart = request_counter*10000,  retmax = 10000)
        record = Entrez.read(handle)
        matching_requests += record["IdList"]
    seq_handle = Entrez.efetch(db="nucleotide", id=matching_requests, retmode = "fasta", rettype = "fasta")
    seq_record = seq_handle.read()
    return seq_record


def download_database(gene_name, seqlen_start=0, seqlen_stop=-1):
    '''Download data to a .fasta file, with filtering options

    Args:
        gene_name: (str) name of the gene to extract from GenBank
        seqlen_start: (int) lower bound of the sequence length filter
        seqlen_stop: (int) upper bound of the sequence length filter

    Returns:
        data_path: (str) path to downloaded data .fasta file
    '''
    data_path = f"{gene_name}[{seqlen_start},{seqlen_stop}].fasta"
    with open(data_path, mode="w") as file:
        file.write(extract_database(gene_name, seqlen_start, seqlen_stop))
    return data_path

def download_sequence(species, gene_name, dst, start_length=None, stop_length= None, id = None, permissive_search = True):
    """
    download sequence from GenBank through the Entrez database.

    Parameters:
    ----------
    species(str): name of species
    gene_name(str): name of gene
    dst(str,Path-like): destination file path
    start_length(int): minimum length of sequence
    stop_length(int): maximum length of sequence
    id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species.
    permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query.
    """

    if id == None:
        search_term = f"{gene_name}[Gene Name] AND {species}[Organism]"
        if start_length!= None or stop_length!= None:
                search_term += f" {start_length}:{stop_length}[Sequence Length]"
        handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc")
        search_result = Entrez.read(handle)
        handle.close()
        id = search_result["IdList"]
        n=0
        for i in id:
            n+=1
        if n==0 and permissive_search:
            search_term = f"{gene_name} {species} {start_length}:{stop_length}[Sequence Length]"
            handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc")
            search_result = Entrez.read(handle)
            handle.close()
            id = search_result["IdList"]
    n=0
    for i in id:
        n+=1
    if n==1:
        handle = Entrez.efetch(db="nucleotide", id=id, retmode = "fasta", rettype = "fasta")
        sequence = handle.read()
        handle.close()
        with open(dst, mode="a") as writer:
            writer.write(sequence)