import random
import time

import requests
from bs4 import BeautifulSoup


# Implements various queries to the Wikidot engine through its AJAX facilities.
class Wikidot:
    def __init__(self, site):
        self.site = site    # Wikidot site to query, e.g. 'http://example.wikidot.com'
        self.delay = 200    # Delay between requests, in msec
        self.debug = False  # Print debug messages
        self.next_timeslot = time.monotonic()  # Can call immediately

    # To honor usage rules, we wait at least self.delay msec between requests.
    # Low-level query functions call this before every request to Wikidot.
    def _wait_request_slot(self):
        tm = time.monotonic()
        if self.next_timeslot > tm:
            time.sleep(self.next_timeslot - tm)
            tm = self.next_timeslot
        self.next_timeslot = tm + self.delay / 1000
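
    # E.g. wd.delay = 1000 throttles the client to at most one request per second
    # before the next low-level call goes out.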

    # Makes a Wikidot AJAX query. Returns (body, title) or raises an error.
    def queryex(self, params):
        # Wikidot expects the same token in the wikidot_token7 cookie and POST field
        # (apparently a CSRF check); any random value works as long as they match.
        token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
        cookies = {"wikidot_token7": token}
        params['wikidot_token7'] = token
        if self.debug:
            print(params)
            print(cookies)
        self._wait_request_slot()
        req = requests.post(self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
        json = req.json()
        if json['status'] == 'ok':
            return json['body'], (json['title'] if 'title' in json else '')
        else:
            raise Exception(req.text)

    # Same, but only returns the body; most responses don't have titles.
    def query(self, params):
        return self.queryex(params)[0]
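
    # Usage sketch (hypothetical site URL; moduleName values mirror the paths
    # under Wikidot's php/modules/ source tree, as in the calls below):
    #   wd = Wikidot('http://example.wikidot.com')
    #   body = wd.query({
    #       'moduleName': 'list/ListPagesModule',
    #       'module_body': '%%page_unix_name%%',
    #   })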

    # Lists all pages on the site.
    # Raw version.
    # For the supported module_body formats see:
    # https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
    def list_pages_raw(self, limit):
        res = self.query({
            'moduleName': 'list/ListPagesModule',
            'limit': limit if limit else '10000',
            'perPage': limit if limit else '10000',
            'module_body': '%%page_unix_name%%',
            'separate': 'false',
            'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default.
        })
        return res

    # Client version: returns a list of page unix names.
    def list_pages(self, limit):
        raw = self.list_pages_raw(limit).replace('<br/>', "\n")
        soup = BeautifulSoup(raw, 'html.parser')
        return soup.div.p.text.split('\n')
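
    # Usage sketch:
    #   wd = Wikidot('http://example.wikidot.com')  # hypothetical site
    #   for name in wd.list_pages(100):  # 100 most recently created pages
    #       print(name)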

    # Retrieves the internal page_id by page unix_name.
    # Page IDs are required for most of the page functions.
    def get_page_id(self, page_unix_name):
        # The only freaking way to get a page ID is to load the page! Wikidot!
        self._wait_request_slot()
        req = requests.get(self.site+'/'+page_unix_name)
        soup = BeautifulSoup(req.text, 'html.parser')
        for item in soup.head.find_all('script'):
            text = item.text
            # The ID is assigned to an inline JS variable:
            #   WIKIREQUEST.info.pageId = 12345;
            pos = text.find("WIKIREQUEST.info.pageId = ")
            if pos >= 0:
                pos += len("WIKIREQUEST.info.pageId = ")
                crlf = text.find(";", pos)
                if crlf >= 0:
                    return int(text[pos:crlf])
                else:
                    return int(text[pos:])
        return None
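
    # Usage sketch ('start' is Wikidot's conventional front-page unix name):
    #   page_id = wd.get_page_id('start')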

    # Retrieves a list of revisions for a page.
    # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
    # Raw version.
    def get_revisions_raw(self, page_id, limit):
        res = self.query({
            'moduleName': 'history/PageRevisionListModule',
            'page_id': page_id,
            'page': '1',
            'perpage': limit if limit else '10000',
            'options': '{"all":true}'
        })
        soup = BeautifulSoup(res, 'html.parser')
        return soup.table.contents

    # Client version: returns a list of dicts with id, date, user and comment.
    def get_revisions(self, page_id, limit):
        revs = []
        for tr in self.get_revisions_raw(page_id, limit):
            if tr.name != 'tr': continue # there's a header + various junk
            # RevID is stored as the value of an INPUT field
            rev_id = tr.input['value'] if tr.input else None
            if rev_id is None: continue # can't parse
            # Unixtime is stored as a CSS class time_*
            rev_date = 0
            date_span = tr.find("span", attrs={"class": "odate"})
            if date_span is not None:
                for cls in date_span['class']:
                    if cls.startswith('time_'):
                        rev_date = int(cls[5:])
            # Username is in the last <a> under <span class="printuser">
            rev_user = None
            user_span = tr.find("span", attrs={"class": "printuser"})
            if user_span is not None:
                links = user_span.find_all('a')
                if links:
                    rev_user = links[-1].getText()
            # Comment is in the last TD of the row
            tds = tr.find_all('td')
            rev_comment = tds[-1].getText() if tds else ""
            revs.append({
                'id': rev_id,
                'date': rev_date,
                'user': rev_user,
                'comment': rev_comment,
            })
        return revs
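
    # Usage sketch (builds on get_page_id above):
    #   page_id = wd.get_page_id('start')
    #   for rev in wd.get_revisions(page_id, 20):
    #       print(rev['id'], rev['date'], rev['user'], rev['comment'])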

    # Retrieves the source text for a revision.
    # There's no raw version because there's nothing else in the raw response.
    def get_revision_source(self, rev_id):
        res = self.query({
            'moduleName': 'history/PageSourceModule',
            'revision_id': rev_id,
            # We don't need page_id
        })
        # The source is HTMLified, but BeautifulSoup's getText() will decode:
        # - htmlentities
        # - <br/>s in place of linebreaks
        # - random real linebreaks (which have to be ignored)
        soup = BeautifulSoup(res, 'html.parser')
        return soup.div.getText().lstrip(' \r\n')

    # Retrieves the rendered version plus additional info unavailable in get_revision_source:
    # * Title
    # * Unixname at the time
    def get_revision_version_raw(self, rev_id):
        res = self.queryex({
            'moduleName': 'history/PageVersionModule',
            'revision_id': rev_id,
        })
        return res

    # Client version: returns a dict with rev_id, unixname, title and rendered content.
    def get_revision_version(self, rev_id):
        res = self.get_revision_version_raw(rev_id) # this one has a title!
        soup = BeautifulSoup(res[0], 'html.parser')
        # The first table is a flyout with revision details. Remove and study it.
        unixname = None
        details = soup.find("div", attrs={"id": "page-version-info"}).extract()
        for tr in details.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 2: continue
            if tds[0].getText().strip() == 'Page name:':
                unixname = tds[1].getText().strip()
        return {
            'rev_id': rev_id,
            'unixname': unixname,
            'title': res[1],
            'content': str(soup), # only the content remains after extract()
        }
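

# Minimal end-to-end sketch, assuming a reachable Wikidot site at the
# hypothetical URL below: take the most recently created page and print
# the source and title of its latest revision.
if __name__ == '__main__':
    wd = Wikidot('http://example.wikidot.com')  # hypothetical site
    pages = wd.list_pages(1)
    if pages:
        page_id = wd.get_page_id(pages[0])
        revs = wd.get_revisions(page_id, 1)
        if revs:
            print(wd.get_revision_source(revs[0]['id']))
            print(wd.get_revision_version(revs[0]['id'])['title'])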