forked from oxfordmmm/catsup
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidate.py
More file actions
152 lines (128 loc) · 5.09 KB
/
Copy pathvalidate.py
File metadata and controls
152 lines (128 loc) · 5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#! /usr/bin/env python3
import csv
import datetime
import json
import logging
import pathlib
import re
# Template column names, grouped by the format check that validate_template()
# applies to their values.
# Columns whose values are checked with validate_date().
in_fieldnames_date_fields = ["sample_collection_date"]
# Columns whose values are checked with validate_email().
in_fieldnames_email_fields = ["submitter_email"]
# Columns whose values are checked with validate_filepath().
in_fieldnames_filepath_fields = ["sample_filename"]
def validate_date(s):
    """Return True if *s* is a real calendar date written as YYYY-MM-DD.

    The value must be a string consisting of exactly four digits, a hyphen,
    two digits, a hyphen and two digits, with nothing before or after. The
    year must lie in the range 1900-2100 inclusive, and the month/day pair
    must exist in that year (so "2021-02-31" is rejected).

    Args:
        s: The value to check. Non-string values are rejected.

    Returns:
        True when the value is a valid date, False otherwise. Never raises.
    """
    if not isinstance(s, str):
        return False

    # ASCII digits only: "\d" would also accept other Unicode digit characters.
    match = re.fullmatch(r"([0-9]{4})-([0-9]{2})-([0-9]{2})", s)
    if match is None:
        return False

    year, month, day = (int(part) for part in match.groups())
    if not 1900 <= year <= 2100:
        return False

    # Let the standard library decide whether the day exists in that month,
    # including leap-year handling.
    try:
        datetime.date(year, month, day)
    except ValueError:
        return False
    return True
def validate_email(s):
    """Return True if *s* looks like an e-mail address.

    This is a pragmatic shape check, not a full RFC 5322 parser. The local
    part is one or more runs of word characters joined by single ".", "+" or
    "-" separators; the domain is one or more runs of word characters joined
    by single "." or "-" separators and must end in a dot followed by a label
    of at least two characters.

    Args:
        s: The value to check. Non-string values are rejected.

    Returns:
        True when the whole string matches the pattern, False otherwise.
    """
    if not isinstance(s, str):
        return False

    # Separators are mandatory inside each repeated group. Making them
    # optional lets the same text be split in exponentially many ways, so a
    # long non-matching input could stall the regex engine.
    pattern = r"\w+(?:[.+-]\w+)*@\w+(?:[.-]\w+)*\.\w{2,}"
    # fullmatch (rather than search with ^...$) also rejects a trailing newline.
    return re.fullmatch(pattern, s) is not None
def validate_filepath(s):
    """Return True if *s* names an existing filesystem entry.

    Args:
        s: A path as a string or path-like object.

    Returns:
        True when the path exists. False when it does not exist, when *s* is
        empty or None, or when *s* cannot be interpreted as a path.
    """
    # pathlib turns "" into ".", which always exists; treat an empty value as
    # a missing path instead of silently validating the current directory.
    if not s:
        return False
    try:
        return pathlib.Path(s).exists()
    except (TypeError, ValueError):
        # Not a str/PathLike, or a string that cannot be a path at all.
        return False
def validate_template(reader):
    """Check every value in every row of a sample template.

    Each row is expected to be a mapping of column name to value (presumably
    rows from a csv.DictReader - confirm against the caller). Every value
    must be non-empty. Values in the columns listed in
    in_fieldnames_date_fields, in_fieldnames_email_fields and
    in_fieldnames_filepath_fields must also pass validate_date,
    validate_email and validate_filepath respectively.

    All problems are logged; validation does not stop at the first one.

    Args:
        reader: An iterable of dict-like rows.

    Returns:
        True if no problem was found, False otherwise.
    """
    failed = False
    for row in reader:
        for k, v in row.items():
            if not v:
                logging.error(f"Found empty value in row: {row}")
                logging.error(f"Column {k} is empty")
                failed = True
                # An empty value has no format to check. Skipping also keeps
                # None (e.g. from a row with too few columns) away from
                # validators that expect a string.
                continue
            if k in in_fieldnames_date_fields and not validate_date(v):
                logging.error(f"Found date in invalid format: {row}")
                logging.error(f"Column {k} date is {v}")
                logging.error("Expected date format YYYY-MM-DD")
                failed = True
            if k in in_fieldnames_email_fields and not validate_email(v):
                logging.error(f"Found email in invalid format: {row}")
                logging.error(f"Column {k} email format is {v}")
                failed = True
            if k in in_fieldnames_filepath_fields and not validate_filepath(v):
                logging.error(f"File path {v} doesn't exist in {row}")
                logging.error(f"Column {k} file path is {v}")
                failed = True
    return not failed
def _fail_config(message):
    """Log *message* under the common config-failure header and return False."""
    logging.error("Failed to validate config:")
    logging.error(message)
    return False


def validate_config(config):
    """Validate the structure of a loaded configuration.

    Checks, in order, stopping at the first failure:

    1. *config* is a dict containing the keys "number_of_example_samples",
       "pipeline", "pipelines" and "nextflow_additional_params".
    2. "pipeline" is a string naming an entry of the "pipelines" dict.
    3. Every entry of "pipelines" is a dict with the keys "script", "image"
       and "human_ref", and every value in it is a path that exists.
    4. If a non-empty "upload" section is present it is a dict; if it has an
       "s3" entry, that entry is a dict with either a "par_url" key or a
       "bucket" key. With "bucket", an "s3cmd-config" key naming an existing
       file is also required.

    Args:
        config: The parsed configuration object.

    Returns:
        True if the configuration passes every check, False otherwise. Each
        failure is reported through logging.error.
    """
    # isinstance (not type(...) ==) so dict/str subclasses are accepted too.
    if not isinstance(config, dict):
        return _fail_config("Config is not a dictionary")

    must_have_keys = (
        "number_of_example_samples",
        "pipeline",
        "pipelines",
        "nextflow_additional_params",
    )
    for must_have_key in must_have_keys:
        if must_have_key not in config:
            return _fail_config(f"Key '{must_have_key}' missing from config")

    pipeline = config["pipeline"]
    pipelines = config["pipelines"]
    if not isinstance(pipeline, str):
        return _fail_config("Key pipeline is not a string")
    if not isinstance(pipelines, dict):
        return _fail_config("Key pipelines is not a dict")
    if pipeline not in pipelines:
        return _fail_config(f"Pipeline {pipeline} has no configuration")

    pipeline_must_have_keys = ("script", "image", "human_ref")
    for pipeline_name, pipeline_conf in pipelines.items():
        if not isinstance(pipeline_conf, dict):
            return _fail_config(f"{pipeline_name} value is not a dict")
        for pipeline_must_have_key in pipeline_must_have_keys:
            if pipeline_must_have_key not in pipeline_conf:
                return _fail_config(
                    f"{pipeline_name} config is missing key {pipeline_must_have_key}"
                )
        # Every value of a pipeline entry is treated as a path that must exist.
        for value in pipeline_conf.values():
            try:
                exists = pathlib.Path(value).exists()
            except (TypeError, ValueError):
                # A value that cannot be a path (e.g. a number or None) is
                # reported as a missing file rather than crashing validation.
                exists = False
            if not exists:
                return _fail_config(
                    f"Pipeline config {pipeline_name}: file {value} does not exist"
                )

    # The upload section is optional; an absent or empty one needs no checks.
    upload = config.get("upload")
    if not upload:
        return True
    if not isinstance(upload, dict):
        return _fail_config("Key upload is not a dict")
    if "s3" not in upload:
        return True

    s3 = upload["s3"]
    if not isinstance(s3, dict) or ("bucket" not in s3 and "par_url" not in s3):
        return _fail_config(
            "S3 config: you need either a bucket or par_url key in upload.s3"
        )

    # A bucket upload needs an s3cmd config file; a par_url upload needs
    # nothing further. "bucket" takes precedence when both keys are present.
    if "bucket" in s3:
        for upload_must_have_key in ("bucket", "s3cmd-config"):
            if upload_must_have_key not in s3:
                return _fail_config(
                    f"S3 config s3: key {upload_must_have_key} does not exist"
                )
        cfg_file = s3["s3cmd-config"]
        if not validate_filepath(cfg_file):
            return _fail_config(f"S3 config file: file {cfg_file} does not exist")

    return True