-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtemp.py
More file actions
86 lines (73 loc) · 3.23 KB
/
Copy pathtemp.py
File metadata and controls
86 lines (73 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import sys
import os
# Put the running interpreter's directory at the front of the executable
# search path. "PATH" and os.pathsep are used instead of the Windows-only
# spellings "path" and ";": os.environ is case-insensitive on Windows, so this
# is equivalent there, and on other platforms it no longer raises KeyError.
# Empty entries are dropped so a missing PATH does not add a blank component.
os.environ["PATH"] = os.pathsep.join(
    entry
    for entry in (
        os.path.dirname(sys.executable or ""),
        os.environ.get("PATH", ""),
    )
    if entry
)
import re
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
def downloadURL(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    The request is sent with a fixed Morningstar ``Referer`` header and a
    desktop-browser ``User-Agent`` header. Progress and failure messages are
    printed to stdout.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded response body. If the connection is closed before the
        whole body has arrived, the bytes received so far are decoded and
        returned instead. On an HTTP error status, a body that is not valid
        UTF-8, or any other exception, the literal string ``"failed"`` is
        returned; the function itself does not raise.
    """
    # Imported locally: the file's import block only brings in urllib, so the
    # name ``http`` would otherwise be unbound and the ``except`` clause below
    # would raise NameError the moment any exception reached it.
    import http.client

    try:
        print("request : " + url)
        req = urllib.request.Request(url)
        req.add_header(
            'Referer',
            'http://financials.morningstar.com/ratios/r.html'
            '?t=AQN&region=can&culture=en-US',
        )
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.1 '
            '(KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
        )
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(req) as resp:
            data = resp.read()
    except http.client.IncompleteRead as e:
        # Keep whatever arrived before the connection was cut.
        print("partial")
        data = e.partial
    except urllib.error.HTTPError as e:
        # ``reason`` is wrapped in str() so a non-string value cannot break
        # the message concatenation.
        print("URL failed : " + str(e.code) + " " + str(e.reason))
        return "failed"
    except Exception as e:
        # Deliberate catch-all: callers rely on the "failed" sentinel rather
        # than on exceptions.
        print("Unknown exception : " + str(e))
        return "failed"

    # Decoding is done in one place so both complete and partial bodies get
    # the same handling; this step cannot succeed if the data is binary.
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        print("URL failed : Response not unicode")
        return "failed"
# Sample company-profile table used to try out the BeautifulSoup lookup in
# _print_business_description(). The pieces are joined without separators.
_SAMPLE_PROFILE_HTML = (
    '<table class="data-table tablemaster">'
    '<tr class="tableheader">'
    '<th colspan="5">Description & Contact Information</th>'
    '</tr>'
    '<tr>'
    '<td class="label">Business Description:</td>'
    '<td class="data" colspan="4">WPT Industrial Real Estate Investment Trust '
    '(the REIT) is an open-ended real estate investment trust. The REIT is '
    'engaged in the business of acquiring and owning industrial investment '
    'properties located in the United States. Its objective is to provide '
    'Unitholders with an opportunity to invest in a portfolio of '
    'institutional-quality industrial properties in the United States '
    'markets, with a particular focus on distribution of the industrial real '
    'estate.</td>'
    '</tr>'
    '<tr>'
    '<td class="label">Address:</td>'
    '<td class="data" colspan="4">199 Bay Street, Suite 4000, Toronto, ON, CAN, M5L 1A9 </td>'
    '</tr>'
    '<tr>'
    '<td class="label">Telephone:</td>'
    '<td class="data">+1 612 800-8503</td>'
    '<td class="spacer"><span></span></td>'
    '<td class="label">Website:</td>'
    '<td class="data"><a href="http://www.wptreit.com" target="_blank">http://www.wptreit.com</a></td></tr>'
    '<tr>'
    '<td class="label">Facsimile:</td>'
    '<td class="data">+1 612 800-8535</td>'
    '<td class="spacer"><span></span></td>'
    '<td class="label">Email:</td>'
    '<td class="data"><a title="Email" href="javascript:qmsm(\'welshpt.com:?:stf\');">stf@welshpt.com</a></td>'
    '</tr>'
)


def _print_business_description(html):
    """Print the cell that follows each "Business Description" label in *html*.

    For every ``<td class="label">`` whose text matches "Business
    Description", the text of its next sibling ``<td>`` is printed; the list
    of matched label cells is printed afterwards.

    Args:
        html: HTML markup as a ``str``.
    """
    # ``from_encoding`` is not passed: the markup is already a str, and
    # BeautifulSoup ignores that argument (with a warning) for str input.
    soup = BeautifulSoup(html, "html.parser")
    # A CSS-selector variant tried earlier:
    #   soup.select("td[class='label'] ~ td[class='data']")
    labels = soup.find_all(
        name="td",
        class_="label",
        string=re.compile("Business Description"),
    )
    for label in labels:
        value_cell = label.find_next_sibling("td")
        # A label with no following cell is skipped instead of raising
        # AttributeError on None.
        if value_cell is not None:
            print(value_cell.text)
    print(labels)


if __name__ == '__main__':
    # Download the key-ratios CSV export for XTSE:AQN and print the result
    # (the body text, or "failed").
    a = "http://financials.morningstar.com/finan/ajax/exportKR2CSV.html?&callback=?&t=XTSE:AQN&region=can&culture=en-US&cur=&order=asc"
    print(str(downloadURL(a)))
    # The script stops here. The HTML-parsing experiment that used to follow
    # this exit was unreachable; to run it, call
    # _print_business_description(_SAMPLE_PROFILE_HTML) before exiting.
    sys.exit(0)