-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCT.gov_DataExtraction.py
More file actions
110 lines (82 loc) · 3.44 KB
/
CT.gov_DataExtraction.py
File metadata and controls
110 lines (82 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Importing packages
import pandas as pd
import numpy as np
import openpyxl as open
import regex as re
import requests
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Load the Excel workbook that lists the project codes.
data = pd.read_excel(r'C:\Users\Desktop\NCT.xlsx')
NCT = data['TrialRegistryID'].tolist()
# Pair every project code with the address of its page.
NCT_web = [
    [code, f"https://xxxxxxxx.xxx/ct2/show/{code}?id={code}&draw=2&rank=1"]
    for code in NCT
]
df = pd.DataFrame(NCT_web, columns=['TrialRegistryID', "url"])
# Open the URL of each project and record its counts, keyed by URL.
Indication = dict()
Indication_names = dict()
Pr_Endpoints = dict()
All_Endpoints = dict()
Secondary_Endpoints = dict()
Exp_cohort = dict()

# Each path is a chain of (tag, attrs) searches, each applied inside the
# results of the previous one.
_OUTCOME_PATH = [
    ("div", {'id': "tab-body"}),
    ("div", {'class': 'tr-indent2'}),
    ("div", {'class': 'tr-indent3'}),
]
_INDICATION_TABLE_PATH = [
    ("div", {'id': "tab-body"}),
    ("div", {'class': 'tr-indent2'}),
    ("div", {'class': 'tr-indent1'}),
    ("div", {'class': 'tr-indent2'}),
    ("table", {'class': 'ct-data_table tr-data_table'}),
]


def _descend(root, path):
    """Return every element reached by following ``path`` from ``root``.

    Equivalent to one nested ``for ... in node.find_all(tag, attrs=attrs)``
    loop per step, in the same visiting order; an element reachable through
    several parents is returned once per parent, as nested loops would do.
    """
    nodes = [root]
    for tag, attrs in path:
        nodes = [hit for node in nodes for hit in node.find_all(tag, attrs=attrs)]
    return nodes


def _indication_names(soup):
    """Return the stringified children of each span in the first ``td.ct-body3`` of every matched table."""
    names = []
    for table in _descend(soup, _INDICATION_TABLE_PATH):
        # NOTE(review): the original `break` is read as "stop after the first
        # ct-body3 cell of the table" (presumably the cell listing the
        # indications) -- confirm against the page layout.
        first_cell = table.find("td", attrs={'class': 'ct-body3'})
        if first_cell is None:
            continue
        for span in first_cell.find_all("span"):
            names.extend(str(child) for child in span)
    return names


def _primary_endpoints(soup):
    """Return the ``li`` items of the lists in the first ``div.ct-body3`` of every matched section."""
    items = []
    for section in _descend(soup, _OUTCOME_PATH):
        # NOTE(review): the original `break` is read as "stop after the first
        # ct-body3 div of the section" (presumably the primary outcomes, with
        # later divs holding the other outcomes) -- confirm.
        first_body = section.find("div", attrs={'class': 'ct-body3'})
        if first_body is None:
            continue
        for ordered_list in first_body.find_all("ol"):
            # Search inside the list itself; searching the parent div here
            # would repeat every item once per list found in the div.
            items.extend(str(item) for item in ordered_list.find_all("li"))
    return items


def _all_endpoints(soup):
    """Return every ``li`` item found in any ``div.ct-body3`` of the matched sections."""
    path = _OUTCOME_PATH + [("div", {'class': 'ct-body3'}), ("li", {})]
    return [str(item) for item in _descend(soup, path)]


# One browser session serves every page; the `finally` closes it even when a
# page fails, so no Chrome process is left running.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    for i in df["url"]:
        driver.get(i)
        content = driver.page_source
        # Name the parser explicitly so parsing does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(content, "html.parser")

        # Number of indications
        indication = _indication_names(soup)
        Indication_names[i] = indication
        Indication[i] = len(indication)

        # Number of primary endpoints
        Pr = _primary_endpoints(soup)
        Pr_Endpoints[i] = len(Pr)

        # Secondary and total endpoints
        all_end = _all_endpoints(soup)
        All_Endpoints[i] = len(all_end)
        Secondary_Endpoints[i] = len(all_end) - Pr_Endpoints[i]

        # Occurrences of "Expansion cohort".
        # The page is downloaded again with requests (independently of the
        # browser); the timeout keeps a stalled server from hanging the run.
        Exp_cohort[i] = len(re.findall(r'(?i)expansion cohort|cohort expansion', requests.get(i, timeout=30).text))
finally:
    driver.quit()
# Append results to the initial DataFrame.
# df2 is the same object as df (no copy is made), so the new columns are
# added to df as well.
df2 = df
# The result dicts are keyed by URL. Look each row's values up by its own URL
# rather than relying on dict order: if the same URL appears in several rows
# the dicts hold only one entry for it, and positional assignment would fail
# with a length mismatch.
urls = df2['url'].tolist()
Indication_Names = [Indication_names[u] for u in urls]
Ind = [Indication[u] for u in urls]
Pr_End = [Pr_Endpoints[u] for u in urls]
Sr_End = [Secondary_Endpoints[u] for u in urls]
All_End = [All_Endpoints[u] for u in urls]
Exp_Coh = [Exp_cohort[u] for u in urls]
df2['Number of indications'] = Ind
df2['Indications'] = Indication_Names
df2['Primary endpoints'] = Pr_End
df2['Secondary endpoints'] = Sr_End
df2['All endpoints'] = All_End
# The expansion-cohort counts were collected per URL but never written out.
df2['Expansion cohort mentions'] = Exp_Coh
# Exporting dataframe to Excel file
df2.to_excel("C:/Users/Desktop/NCT2.xlsx")