-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathorthoxml.py
More file actions
329 lines (283 loc) · 14.9 KB
/
orthoxml.py
File metadata and controls
329 lines (283 loc) · 14.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
'''
From http://orthoxml.org/0.3/orthoxml_doc_v0.3.html:
OrthoXML is an XML schema designed to describe orthology relations. Orthologs
are defined as genes in different species deriving from a single gene in the
last common ancestor. This relationship makes them interesting, as they are
likely to have the same function.
OrthoXML is designed to be a versatile format to store orthology data from
different sources in a uniform manner. It can store assignment from both
pairwise approaches and tree based approaches with a variable level of detail.
OrthoXML allows direct comparison and integration of orthology data from
different resources. Additional, resource-specific information can also be
included.
OrthoXML is an XML format. XML is a markup language which embeds the content in
a structured way so that it is easy to process and validate. Orthology data can be
structured using XML as a container, where the relationships of genes and their
orthology groups can be described as data objects. Since OrthoXML is defined as
an XML schema, all XML files can be validated and checked to see if they are
well-formed documents.
http://www.orthoxml.org/xml/Main.html
http://www.orthoxml.org/0.3/examples/orthoxml_example_v0.3.xml
http://www.orthoxml.org/0.3/orthoxml.xsd
http://orthoxml.org/0.3/orthoxml_doc_v0.3.html
'''
def test():
    '''
    Build a small two-species example document and write it to stdout.

    Exercises every element type defined in this module: genes, databases,
    species, score definitions, scores, properties, gene refs, and nested
    paralog/ortholog groups.
    '''
    import sys

    # Species 1: mouse, one database holding a single gene.
    mouseGene = Gene("3", "P1234", protId="Q1341", transcriptId="P1234t")
    mouseDb = Database("Uniprot", "2011_06", genes=[mouseGene], protLink="http://www.uniprot.org/uniprot/")
    mouse = Species("Mus musculus", "10090", [mouseDb], notes=Notes("uniprot mouse rocks!"))

    # Species 2: human, one database holding two genes and all three link kinds.
    humanGenes = [Gene("4", "Q1234", protId="R9999"), Gene("5", "Q1235", protId="R6678")]
    humanDb = Database(
        "Uniprot",
        "2011_06",
        genes=humanGenes,
        geneLink="http://www.uniprot.org/uniprot/",
        protLink="http://www.uniprot.org/uniprot/",
        transcriptLink="http://www.uniprot.org/uniprot/",
    )
    human = Species("Homo sapiens", "9606", [humanDb], notes=Notes("uniprot human rocks!"))

    # These two notes are constructed but never attached to anything.
    notes2 = Notes("ortho notes")
    notes3 = Notes("group notes")

    distDef = ScoreDef("dist", "maximum-likelihood evolutionary distance, from 0.0 to 19.0")
    distScore = Score("dist", "1.039")

    refMouse = GeneRef("3", notes=Notes("generef 3 is cool"))
    refHumanA = GeneRef("4", scores=[Score("dist", "1.1"), Score("dist", "1.2")])
    refHumanB = GeneRef("5", notes=Notes("generef 3 is cool"))

    groupProps = [Property("speed", "fast"), Property("valuable")]
    paralogs = ParalogGroup(
        [refHumanA, refHumanB],
        properties=groupProps,
        iden="p1",
        scores=[distScore],
        notes=Notes("good job"),
    )
    orthologs = OrthologGroup([paralogs, refMouse], scores=[distScore], notes=Notes("good job"))

    document = toOrthoXML(
        "roundup",
        "2",
        [mouse, human],
        [orthologs],
        scoreDefs=[distDef],
        notes=Notes("params look ok"),
        indent=' ',
        newl='\n',
    )
    # Stream the pieces straight to stdout; the document is never joined in memory.
    for piece in document:
        sys.stdout.write(piece)
class Notes(object):
    '''
    Free-text annotation, serialized as a <notes> element.
    '''
    def __init__(self, notes):
        '''
        notes: a string describing something
        '''
        self.notes = notes

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after each emitted line.
        level: nesting depth of the <notes> element.
        yields: the pieces of a <notes> element. A falsy notes value yields a
        single empty <notes/> element. Otherwise the stripped text is emitted
        between opening and closing tags, one level deeper.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        pad = indent * level
        if self.notes:
            yield pad + '<notes>' + newl
            # Escape &, < and > so arbitrary note text cannot break the document.
            yield indent * (level + 1) + escape(self.notes.strip()) + newl
            yield pad + '</notes>' + newl
        else:
            yield pad + '<notes/>' + newl
class Gene(object):
    '''
    A gene entry, serialized as an empty <gene .../> element.
    '''
    def __init__(self, iden, geneId=None, protId=None, transcriptId=None):
        '''
        iden: required.  the integer (or string of an integer) id used to identify the gene in the orthoxml document.
          Unique among other gene ids in the document.
        geneId: an external gene identifier, presumably from the database associated with this Gene.
        protId: an external protein identifier, presumably from the database associated with this Gene.
        transcriptId: an external transcript identifier, presumably from the database associated with this Gene.
        At least one of geneId, protId, and transcriptId is required (this is not checked here).
        '''
        self.id = int(iden)
        self.geneId = geneId
        self.protId = protId
        self.transcriptId = transcriptId

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after the element.
        level: nesting depth of the element.
        yields: a single string, the <gene/> element. Identifier attributes
        that are None are omitted; the others are XML-escaped.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        pieces = ['<gene id="{}"'.format(self.id)]
        for key in ('geneId', 'protId', 'transcriptId'):
            value = getattr(self, key)
            if value is not None:
                # Values sit inside double quotes, so '"' is escaped as well as &, <, >.
                safe = escape('{}'.format(value), {'"': '&quot;'})
                pieces.append(' {}="{}"'.format(key, safe))
        pieces.append('/>' + newl)
        yield indent * level + ''.join(pieces)
class ScoreDef(object):
    '''
    Definition of a score type, serialized as an empty <scoreDef .../> element.
    '''
    def __init__(self, iden, desc):
        '''
        iden: required.  used to identify this score definition in the document.
        desc: a description of what type of score this is, e.g. BLAST E-value.
        '''
        self.id = iden
        self.desc = desc

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after the element.
        level: nesting depth of the element.
        yields: a single string, the <scoreDef/> element with XML-escaped
        id and desc attributes.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        # Descriptions are free text (e.g. "E-value < 1e-5"), so escape &, <, > and '"'.
        quote = {'"': '&quot;'}
        safeId = escape('{}'.format(self.id), quote)
        safeDesc = escape('{}'.format(self.desc), quote)
        yield indent * level + '<scoreDef id="{}" desc="{}"/>{}'.format(safeId, safeDesc, newl)
class Score(object):
    '''
    A score value, serialized as an empty <score .../> element.
    '''
    def __init__(self, iden, value):
        '''
        iden: required.  refers to a ScoreDef id, which defines the type of this score.
        value: the actual score itself.
        '''
        self.id = iden
        self.value = value

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after the element.
        level: nesting depth of the element.
        yields: a single string, the <score/> element with XML-escaped
        id and value attributes.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        # Format first so non-string values (e.g. floats) are accepted, then escape.
        quote = {'"': '&quot;'}
        safeId = escape('{}'.format(self.id), quote)
        safeValue = escape('{}'.format(self.value), quote)
        yield indent * level + '<score id="{}" value="{}"/>{}'.format(safeId, safeValue, newl)
class Property(object):
    '''
    A name/value annotation, serialized as an empty <property .../> element.
    '''
    def __init__(self, name, value=None):
        '''
        name: required.  the property key.
        value: optional, in case the property key is a flag.
        From the orthoxml docs: Key-value pair for group annotations, for instance statistics about the group members.
        '''
        self.name = name
        self.value = value

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after the element.
        level: nesting depth of the element.
        yields: a single string, the <property/> element. The value attribute
        is omitted when value is None; attribute values are XML-escaped.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        quote = {'"': '&quot;'}
        tag = '<property name="{}"'.format(escape('{}'.format(self.name), quote))
        if self.value is not None:
            tag += ' value="{}"'.format(escape('{}'.format(self.value), quote))
        yield indent * level + tag + '/>' + newl
class Database(object):
    '''
    A source database and its genes, serialized as a <database> element
    wrapping a <genes> element.
    '''
    def __init__(self, name, version, genes, geneLink=None, protLink=None, transcriptLink=None):
        '''
        name: e.g. Ensembl or Uniprot.  The name of the database where the genes come from.
        version: The version or release these genes are from.  e.g. Homo_sapiens.NCBI36.52.pep.all.fa or 2011_06
        genes: a seq of at least 1 Gene object.  these genes belong to this database in this species.
        geneLink: optional url
        protLink: optional url
        transcriptLink: optional url
        see the orthoxml docs for more about the links.
        '''
        self.name = name
        self.version = version
        self.genes = genes
        self.geneLink = geneLink
        self.protLink = protLink
        self.transcriptLink = transcriptLink

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after each emitted line.
        level: nesting depth of the <database> element.
        yields: the opening <database> tag (link attributes that are None are
        omitted), a <genes> element containing each gene's xml two levels
        deeper, and the closing tag. Attribute values are XML-escaped.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        # URLs commonly contain '&', which must not appear raw in an attribute.
        quote = {'"': '&quot;'}
        tag = '<database name="{}" version="{}"'.format(
            escape('{}'.format(self.name), quote),
            escape('{}'.format(self.version), quote))
        for key in ('geneLink', 'protLink', 'transcriptLink'):
            link = getattr(self, key)
            if link is not None:
                tag += ' {}="{}"'.format(key, escape('{}'.format(link), quote))
        yield indent * level + tag + '>' + newl

        yield indent * (level + 1) + '<genes>' + newl
        for gene in self.genes:
            for xml in gene.toXml(indent, newl, level + 2):
                yield xml
        yield indent * (level + 1) + '</genes>' + newl
        yield indent * level + '</database>' + newl
class Species(object):
    '''
    A species and its databases, serialized as a <species> element.
    '''
    def __init__(self, name, ncbiTaxId, databases, notes=None):
        '''
        name: e.g. Homo sapiens.  An organism name.
        ncbiTaxId: an integer, e.g. 9606.  The NCBI Taxonomy database id corresponding to this species.
        databases: a seq of at least one Database obj.
        notes: optional Notes object annotating this species.
        '''
        self.name = name
        self.ncbiTaxId = int(ncbiTaxId)
        self.databases = databases
        self.notes = notes

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after each emitted line.
        level: nesting depth of the <species> element.
        yields: the opening <species> tag, each database's xml one level
        deeper, the notes (if any), and the closing tag. The species name is
        XML-escaped.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        pad = indent * level
        safeName = escape('{}'.format(self.name), {'"': '&quot;'})
        yield pad + '<species name="{}" NCBITaxId="{}">{}'.format(safeName, self.ncbiTaxId, newl)
        for database in self.databases:
            for xml in database.toXml(indent, newl, level + 1):
                yield xml
        if self.notes:
            for xml in self.notes.toXml(indent, newl, level + 1):
                yield xml
        yield pad + '</species>' + newl
class OrthologGroup(object):
    '''
    A group of orthologs, serialized as an <orthologGroup> element.
    '''
    def __init__(self, members, iden=None, scores=None, properties=None, notes=None):
        '''
        iden: identifier for the group of orthologs in the origin of this orthoxml document.  optional, but recommended if it is available.
        members: a list of 2 or more elements, where each element can be a GeneRef, OrthologGroup, or ParalogGroup
        scores: optional.  a seq of zero or more Score objs describing this group.
        properties: optional.  a seq of 0 or more Property objs describing this group.
        notes: an optional Notes object about this group.
        Members of an OrthologGroup are related by a speciation event at their most recent point of origin.
        '''
        self.id = iden
        self.scores = scores if scores is not None else []
        self.properties = properties if properties is not None else []
        self.members = members
        self.notes = notes

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after each emitted line.
        level: nesting depth of the <orthologGroup> element.
        yields: the opening tag, then the xml of the scores, properties and
        members (in that order) one level deeper, then the notes (if any),
        then the closing tag.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        pad = indent * level
        # Only None and the empty string mean "no id"; an id of 0 is a real identifier.
        if self.id is None or self.id == '':
            yield pad + '<orthologGroup>' + newl
        else:
            # The opening tag must NOT be self-closing: children and a closing tag follow.
            safeId = escape('{}'.format(self.id), {'"': '&quot;'})
            yield pad + '<orthologGroup id="{}">{}'.format(safeId, newl)

        for children in (self.scores, self.properties, self.members):
            for child in children:
                for xml in child.toXml(indent, newl, level + 1):
                    yield xml
        if self.notes:
            for xml in self.notes.toXml(indent, newl, level + 1):
                yield xml
        yield pad + '</orthologGroup>' + newl
class ParalogGroup(object):
    '''
    A group of paralogs, serialized as a <paralogGroup> element.
    '''
    def __init__(self, members, iden=None, scores=None, properties=None, notes=None):
        '''
        iden: identifier for the group of paralogs in the origin of this orthoxml document.  optional, but recommended if it is available.
        members: a list of 2 or more elements, where each element can be a GeneRef, OrthologGroup, or ParalogGroup
        scores: optional.  a seq of zero or more Score objs describing this group.
        properties: optional.  a seq of 0 or more Property objs describing this group.
        notes: an optional Notes object about this group.
        Members of a ParalogGroup are related by a duplication event at their most recent point of origin.
        '''
        self.id = iden
        self.scores = scores if scores is not None else []
        self.properties = properties if properties is not None else []
        self.members = members
        self.notes = notes

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after each emitted line.
        level: nesting depth of the <paralogGroup> element.
        yields: the opening tag, then the xml of the scores, properties and
        members (in that order) one level deeper, then the notes (if any),
        then the closing tag.
        '''
        # Imported here so the class does not rely on a module-level import.
        from xml.sax.saxutils import escape

        pad = indent * level
        # Only None and the empty string mean "no id"; an id of 0 is a real identifier.
        if self.id is None or self.id == '':
            yield pad + '<paralogGroup>' + newl
        else:
            # The opening tag must NOT be self-closing: children and a closing tag follow.
            safeId = escape('{}'.format(self.id), {'"': '&quot;'})
            yield pad + '<paralogGroup id="{}">{}'.format(safeId, newl)

        for children in (self.scores, self.properties, self.members):
            for child in children:
                for xml in child.toXml(indent, newl, level + 1):
                    yield xml
        if self.notes:
            for xml in self.notes.toXml(indent, newl, level + 1):
                yield xml
        yield pad + '</paralogGroup>' + newl
class GeneRef(object):
    '''
    A reference to a Gene by id, serialized as a <geneRef> element.
    '''
    def __init__(self, iden, scores=None, notes=None):
        '''
        iden: a Gene id
        scores: optional.  a seq of zero or more Score objs describing this GeneRef.
        notes: optional.  Notes object describing this GeneRef.
        '''
        self.id = int(iden)
        self.notes = notes
        if scores is None:
            scores = []
        self.scores = scores

    def toXml(self, indent, newl, level):
        '''
        indent: the string used for one level of indentation.
        newl: the string appended after each emitted line.
        level: nesting depth of the <geneRef> element.
        yields: a single empty <geneRef/> element when there are neither
        notes nor scores; otherwise an opening tag, the scores and then the
        notes one level deeper, and a closing tag.
        '''
        prefix = indent * level
        hasChildren = bool(self.notes or self.scores)

        # Childless references collapse to an empty element.
        if not hasChildren:
            yield prefix + '<geneRef id="{}"/>{}'.format(self.id, newl)
            return

        yield prefix + '<geneRef id="{}">{}'.format(self.id, newl)
        depth = level + 1
        for score in self.scores:
            for piece in score.toXml(indent, newl, depth):
                yield piece
        if self.notes:
            for piece in self.notes.toXml(indent, newl, depth):
                yield piece
        yield prefix + '</geneRef>{}'.format(newl)
def toOrthoXML(origin, originVersion, species, groups, scoreDefs=None, notes=None, version='0.3', indent=' ', newl='\n'):
    '''
    origin: the source of these ortholog groups.  e.g. inparanoid, roundup, omabrowser.
    originVersion: the version or release of the source database.  e.g 2, 7.0, GRCh37.p5, 2011_06.
    species: an iterable of Species objects.  These species contain the databases which contain the genes that are referred to within the groups.
      Iterable so items can be generated on the fly to avoid memory issues.
    groups: an iterable of one or more OrthologGroup objects.  Iterable so items can be generated on the fly to avoid memory issues.
    scoreDefs: optional.  a seq of zero or more ScoreDef objects.  These are referred to by the Score elements of groups and gene refs.
    notes: an optional notes object describing the origin and other details about these orthologs.
    version: the value written to the root element's version attribute.
    indent: the string used for one level of indentation.
    newl: the string appended after each emitted line.
    To avoid constructing the entire document in memory or holding all groups in memory, this function takes an iterable groups and yields string pieces of the xml
    document.  Useful for writing to a file, a network socket, etc.
    The origin, originVersion and version values are XML-escaped before being written as root attributes.
    returns: a generator that yields strings which can be concatenated to form an xml document.
    '''
    # Imported here so the function does not rely on a module-level import.
    from xml.sax.saxutils import escape

    level = 0
    quote = {'"': '&quot;'}

    yield '<?xml version="1.0" encoding="utf-8"?>' + newl
    rootStart = '<orthoXML xmlns="http://orthoXML.org/2011/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="{}" '.format(
        escape('{}'.format(version), quote))
    rootStart += 'origin="{}" originVersion="{}" xsi:schemaLocation="http://orthoXML.org/2011/ http://www.orthoxml.org/0.3/orthoxml.xsd">{}'.format(
        escape('{}'.format(origin), quote),
        escape('{}'.format(originVersion), quote),
        newl)
    yield rootStart

    if notes:
        for xml in notes.toXml(indent, newl, level + 1):
            yield xml

    # yield species one at a time b/c they can be big, containing tens of thousands of genes.
    for spec in species:
        for xml in spec.toXml(indent, newl, level + 1):
            yield xml

    # The <scores> wrapper is only written when there is at least one definition.
    if scoreDefs:
        yield indent * (level + 1) + '<scores>' + newl
        for scoreDef in scoreDefs:
            for xml in scoreDef.toXml(indent, newl, level + 2):
                yield xml
        yield indent * (level + 1) + '</scores>' + newl

    # yield groups one at a time, b/c there could be very many of them, too many to hold in memory
    yield indent * (level + 1) + '<groups>' + newl
    for group in groups:
        for xml in group.toXml(indent, newl, level + 2):
            yield xml
    yield indent * (level + 1) + '</groups>' + newl

    yield '</orthoXML>' + newl
if __name__ == '__main__':
    # Running the module as a script does nothing; call test() by hand to
    # write a sample document to stdout.
    pass