diff --git a/doc/api/driver.phantomjs.rst b/doc/api/driver.phantomjs.rst deleted file mode 100644 index c9fedb9a..00000000 --- a/doc/api/driver.phantomjs.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. This document was automatically generated. - DO NOT EDIT! - -:mod:`driver.phantomjs` Module -============================== - -.. automodule:: wpull.driver.phantomjs - :members: - :show-inheritance: - :undoc-members: diff --git a/doc/api/processor.coprocessor.phantomjs.rst b/doc/api/processor.coprocessor.phantomjs.rst deleted file mode 100644 index 6b23299a..00000000 --- a/doc/api/processor.coprocessor.phantomjs.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. This document was automatically generated. - DO NOT EDIT! - -:mod:`processor.coprocessor.phantomjs` Module -============================================= - -.. automodule:: wpull.processor.coprocessor.phantomjs - :members: - :show-inheritance: - :undoc-members: diff --git a/doc/differences.rst b/doc/differences.rst index bdf13b56..a9cb9055 100644 --- a/doc/differences.rst +++ b/doc/differences.rst @@ -69,12 +69,5 @@ Missing in Wget * ``--proxy-server`` * ``--proxy-server-address`` * ``--proxy-server-port`` -* ``--phantomjs`` -* ``--phantomjs-exe`` -* ``--phantomjs-max-time`` -* ``--phantomjs-scroll`` -* ``--phantomjs-wait`` -* ``--no-phantomjs-snapshot`` -* ``--no-phantomjs-smart-scroll`` * ``--youtube-dl`` * ``--youtube-dl-exe`` diff --git a/doc/install.rst b/doc/install.rst index 1febbf93..20fdd8c1 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -25,8 +25,6 @@ The following are optional: * `psutil` for monitoring disk space * `Manhole `_ for a REPL debugging socket -* `PhantomJS 1.9.8, 2.1 `_ for capturing interactive - JavaScript pages * `youtube-dl `_ for downloading complex video streaming sites @@ -116,9 +114,3 @@ pip. Note for Linux users, ensure you are executing the appropriate Python version when installing pip. -PhantomJS (Optional) -++++++++++++++++++++ - -It is recommended to download a prebuilt binary build from -http://phantomjs.org/download.html. - diff --git a/doc/intro.rst b/doc/intro.rst index f9d4d0f1..41f0454b 100644 --- a/doc/intro.rst +++ b/doc/intro.rst @@ -15,7 +15,7 @@ Notable Features: * Written in Python: lightweight, modifiable, robust, & scriptable * Graceful stopping; on-disk database resume -* PhantomJS & youtube-dl integration (experimental) +* youtube-dl integration (experimental) .. ⬆ Please keep this intro above in sync with the README file. ⬆ Additional intro stuff not in the README should go below. diff --git a/doc/usage.rst b/doc/usage.rst index 8b084c4a..b8d83e59 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -83,20 +83,6 @@ The requests will go through the proxy to Wpull's HTTP client (which can be reco It is not possible to use the proxy standalone at this time. -PhantomJS Integration -+++++++++++++++++++++ - -**PhantomJS support is currently experimental.** - -``--phantomjs`` will enable PhantomJS integration. - -If a HTML document is encountered, Wpull will open the URL in PhantomJS. After the page is loaded, Wpull will try to scroll the page as specified by ``--phantomjs-scroll``. Then, the HTML DOM source is scraped for URLs as normal. HTML and PDF snapshots are taken by default. - -Currently, Wpull will *not do anything else* to manipulate the page such as clicking on links. As a consequence, Wpull with PhantomJS is *not* a complete solution for dynamic web pages yet! - -Storing console logs and alert messages inside the WARC file is not yet supported. - - youtube-dl Integration ++++++++++++++++++++++ diff --git a/doc/warc.rst b/doc/warc.rst index 2c3ed6b6..70844638 100644 --- a/doc/warc.rst +++ b/doc/warc.rst @@ -45,35 +45,6 @@ The response data is recorded as * WARC-Concurrent-To: a WARC Record ID of the Control Conversation -PhantomJS -+++++++++ - - -Snapshot --------- - -A PhantomJS Snapshot represents the state of the DOM at the time of capture. - -A Snapshot is recorded as - -* WARC-Type: ``resource`` -* WARC-Target-URI: ``urn:X-wpull:snapshot?url=URLHERE`` where ``URLHERE`` is a percent-encoded URL of the PhantomJS page. -* Content-Type: one of ``application/pdf``, ``text/html``, ``image/png`` -* WARC-Concurrent-To: a WARC Record ID of a Snapshot Action Metadata. - - -Snapshot Action Metadata ------------------------- - -An Action Metadata is a log of steps performed before a Snapshot is taken. - -It is recorded as - -* WARC-Type: ``metadata`` -* Content-Type: ``application/json`` -* WARC-Target-URI: ``urn:X-wpull:snapshot?url=URLHERE`` where ``URLHERE`` is a percent-encoded URL of the PhantomJS page. - - Wpull Metadata ++++++++++++++ diff --git a/wpull/application/builder.py b/wpull/application/builder.py index 2a13e69d..6a8e934d 100644 --- a/wpull/application/builder.py +++ b/wpull/application/builder.py @@ -32,7 +32,6 @@ from wpull.cookie import DeFactoCookiePolicy from wpull.database.sqltable import URLTable as SQLURLTable from wpull.database.wrap import URLTableHookWrapper -from wpull.driver.phantomjs import PhantomJSDriver from wpull.network.bandwidth import BandwidthLimiter from wpull.network.dns import Resolver from wpull.network.pool import ConnectionPool @@ -40,7 +39,6 @@ from wpull.pipeline.app import AppSource, AppSession from wpull.pipeline.pipeline import Pipeline, PipelineSeries from wpull.pipeline.session import URLItemSource -from wpull.processor.coprocessor.phantomjs import PhantomJSCoprocessor from wpull.processor.coprocessor.proxy import ProxyCoprocessor from wpull.processor.coprocessor.youtubedl import YoutubeDlCoprocessor from wpull.processor.delegate import DelegateProcessor @@ -106,8 +104,6 @@ def __init__(self, args, unit_test=False): 'HTMLScraper': HTMLScraper, 'JavaScriptScraper': JavaScriptScraper, 'PathNamer': PathNamer, - 'PhantomJSDriver': PhantomJSDriver, - 'PhantomJSCoprocessor': PhantomJSCoprocessor, 'PipelineSeries': PipelineSeries, 'ProcessingRule': ProcessingRule, 'Processor': DelegateProcessor, diff --git a/wpull/application/options.py b/wpull/application/options.py index aea41cc6..3cb85879 100644 --- a/wpull/application/options.py +++ b/wpull/application/options.py @@ -205,7 +205,6 @@ def _add_app_args(self): self._add_recursive_args() self._add_accept_args() self._add_proxy_server_args() - self._add_phantomjs_args() self._add_youtube_dl_args() def _add_startup_args(self): @@ -1303,54 +1302,6 @@ def _add_proxy_server_args(self): help=_('bind the proxy server port to PORT') ) - def _add_phantomjs_args(self): - group = self.add_argument_group(_('PhantomJS')) - group.add_argument( - '--phantomjs', - action='store_true', - help=_('use PhantomJS for loading dynamic pages'), - ) - group.add_argument( - '--phantomjs-exe', - metavar='PATH', - default='phantomjs', - help=_('path of PhantomJS executable') - ) - group.add_argument( - '--phantomjs-max-time', - default=900, - type=self.int_0_inf, - help=_('maximum duration of PhantomJS session') - ) - group.add_argument( - '--phantomjs-scroll', - type=int, - default=20, - metavar='NUM', - help=_('scroll the page up to NUM times'), - ) - group.add_argument( - '--phantomjs-wait', - type=float, - default=1.0, - metavar='SEC', - help=_('wait SEC seconds between page interactions'), - ) - group.add_argument( - '--no-phantomjs-snapshot', - action='store_false', - dest='phantomjs_snapshot', - default=True, - help=_('don’t take dynamic page snapshots'), - ) - group.add_argument( - '--no-phantomjs-smart-scroll', - action='store_false', - dest='phantomjs_smart_scroll', - default=True, - help=_('always scroll the page to maximum scroll count option'), - ) - def _add_youtube_dl_args(self): group = self.add_argument_group(_('youtube-dl')) group.add_argument( diff --git a/wpull/application/tasks/download.py b/wpull/application/tasks/download.py index bbbb552e..206bebbd 100644 --- a/wpull/application/tasks/download.py +++ b/wpull/application/tasks/download.py @@ -7,7 +7,6 @@ from wpull.backport.logging import BraceMessage as __ from wpull.cookie import BetterMozillaCookieJar -from wpull.processor.coprocessor.phantomjs import PhantomJSParams from wpull.namevalue import NameValueRecord from wpull.pipeline.pipeline import ItemTask from wpull.pipeline.session import ItemSession @@ -18,7 +17,6 @@ from wpull.protocol.http.stream import Stream as HTTPStream import wpull.util import wpull.processor.coprocessor.youtubedl -import wpull.driver.phantomjs import wpull.application.hook _logger = logging.getLogger(__name__) @@ -224,7 +222,7 @@ class ProxyServerSetupTask(ItemTask[AppSession]): def process(self, session: AppSession): '''Build MITM proxy server.''' args = session.args - if not (args.phantomjs or args.youtube_dl or args.proxy_server): + if not (args.youtube_dl or args.proxy_server): return proxy_server = session.factory.new( @@ -388,81 +386,15 @@ class CoprocessorSetupTask(ItemTask[ItemSession]): @asyncio.coroutine def process(self, session: AppSession): args = session.args - if args.phantomjs or args.youtube_dl or args.proxy_server: + if args.youtube_dl or args.proxy_server: proxy_port = session.proxy_server_port assert proxy_port - if args.phantomjs: - phantomjs_coprocessor = self._build_phantomjs_coprocessor(session, proxy_port) - else: - phantomjs_coprocessor = None - if args.youtube_dl: youtube_dl_coprocessor = self._build_youtube_dl_coprocessor(session, proxy_port) else: youtube_dl_coprocessor = None - @classmethod - def _build_phantomjs_coprocessor(cls, session: AppSession, proxy_port: int): - '''Build proxy server and PhantomJS client. controller, coprocessor.''' - page_settings = {} - default_headers = NameValueRecord() - - for header_string in session.args.header: - default_headers.parse(header_string) - - # Since we can only pass a one-to-one mapping to PhantomJS, - # we put these last since NameValueRecord.items() will use only the - # first value added for each key. - default_headers.add('Accept-Language', '*') - - if not session.args.http_compression: - default_headers.add('Accept-Encoding', 'identity') - - default_headers = dict(default_headers.items()) - - if session.args.read_timeout: - page_settings['resourceTimeout'] = session.args.read_timeout * 1000 - - page_settings['userAgent'] = session.args.user_agent \ - or session.default_user_agent - - # Test early for executable - wpull.driver.phantomjs.get_version(session.args.phantomjs_exe) - - phantomjs_params = PhantomJSParams( - wait_time=session.args.phantomjs_wait, - num_scrolls=session.args.phantomjs_scroll, - smart_scroll=session.args.phantomjs_smart_scroll, - snapshot=session.args.phantomjs_snapshot, - custom_headers=default_headers, - page_settings=page_settings, - load_time=session.args.phantomjs_max_time, - ) - - extra_args = [ - '--proxy', - '{}:{}'.format(session.args.proxy_server_address, proxy_port), - '--ignore-ssl-errors=true' - ] - - phantomjs_driver_factory = functools.partial( - session.factory.class_map['PhantomJSDriver'], - exe_path=session.args.phantomjs_exe, - extra_args=extra_args, - ) - - phantomjs_coprocessor = session.factory.new( - 'PhantomJSCoprocessor', - phantomjs_driver_factory, - session.factory['ProcessingRule'], - phantomjs_params, - root_path=session.args.directory_prefix, - warc_recorder=session.factory.get('WARCRecorder'), - ) - - return phantomjs_coprocessor - @classmethod def _build_youtube_dl_coprocessor(cls, session: AppSession, proxy_port: int): '''Build youtube-dl coprocessor.''' diff --git a/wpull/application/tasks/warc.py b/wpull/application/tasks/warc.py index be9c2ceb..f5791915 100644 --- a/wpull/application/tasks/warc.py +++ b/wpull/application/tasks/warc.py @@ -9,7 +9,6 @@ from wpull.pipeline.app import AppSession from wpull.pipeline.pipeline import ItemTask from wpull.warc.recorder import WARCRecorder, WARCRecorderParams -import wpull.driver.phantomjs import wpull.processor.coprocessor.youtubedl import wpull.warc.format @@ -43,11 +42,6 @@ def process(self, session: AppSession): software_string = WARCRecorder.DEFAULT_SOFTWARE_STRING - if args.phantomjs: - software_string += ' PhantomJS/{0}'.format( - wpull.driver.phantomjs.get_version(exe_path=args.phantomjs_exe) - ) - if args.youtube_dl: software_string += ' youtube-dl/{0}'.format( wpull.processor.coprocessor.youtubedl.get_version(exe_path=args.youtube_dl_exe) diff --git a/wpull/driver/Makefile b/wpull/driver/Makefile deleted file mode 100644 index cefb74e7..00000000 --- a/wpull/driver/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -all: phantomjs - -phantomjs: PhantomJS.hx - haxe -main PhantomJS -js phantomjs.js diff --git a/wpull/driver/PhantomJS.hx b/wpull/driver/PhantomJS.hx deleted file mode 100644 index eb63863c..00000000 --- a/wpull/driver/PhantomJS.hx +++ /dev/null @@ -1,521 +0,0 @@ -import haxe.Json; -import js.Browser; - -using StringTools; - - -class PhantomJS { - var system:Dynamic; - var webpage:Dynamic; - var phantom:Dynamic; - var fs:Dynamic; - var page:Dynamic; - var config:Dynamic; - var eventLogFile:Dynamic; - var actionLogFile:Dynamic; - var activityCounter = 0; - var pendingResourcesAfterLoad = 0; - var pageLoaded = false; - - public function new() { - system = untyped __js__("require")("system"); - webpage = untyped __js__("require")("webpage"); - phantom = untyped __js__("phantom"); - fs = untyped __js__("require")("fs"); - } - - public static function main() { - var app = new PhantomJS(); - app.run(); - } - - function logStderrLine(message:String) { - if (system.stderr != null) { - return system.stderr.writeLine(message); - } else { - return system.stdout.writeLine(message); - } - } - - /** - * Do the entire process pipeline. - */ - public function run() { - setUpErrorHandler(); - loadConfig(); - createPage(); - listenPageEvents(); - loadUrl(); - } - - /** - * Set up error handler which logs to stderr - */ - function setUpErrorHandler() { - phantom.onError = function (message:String, traceArray:Array) { - logStderrLine(message); - - for (traceLine in traceArray) { - var source:String; - var functionName:String = ""; - - if (traceLine.file != null) { - source = traceLine.file; - } else { - source = traceLine.sourceURL; - } - - if (Reflect.field(traceLine, "function") != null) { - functionName = Reflect.field(traceLine, "function"); - } - - logStderrLine(' $source:${traceLine.line} $functionName'); - } - } - } - - /** - * Load the launch configuration. - */ - function loadConfig() { - if (system.args.length != 2) { - throw "Missing launch configuration."; - } - - var configContent = fs.read(system.args[1]); - config = Json.parse(configContent); - - openLogFiles(); - } - - /** - * Open the event and action log files. - */ - function openLogFiles() { - var eventLogFilename:String = Reflect.field(config, "event_log_filename"); - var actionLogFilename:String = Reflect.field(config, "action_log_filename"); - - if (eventLogFilename != null) { - eventLogFile = fs.open(eventLogFilename, "w"); - } - - if (actionLogFilename != null) { - actionLogFile = fs.open(actionLogFilename, "w"); - } - } - - /** - * Create the page and set up the page settings. - */ - function createPage() { - page = webpage.create(); - - page.evaluate("function () { document.body.bgColor = 'white'; }"); - page.viewportSize = { - width: Reflect.field(config, 'viewport_width'), - height: Reflect.field(config, 'viewport_height') - }; - - var paperWidth:Int = Reflect.field(config, 'paper_width'); - var paperHeight:Int = Reflect.field(config, 'paper_height'); - - page.paperSize = { - width: '$paperWidth px', - height: '$paperHeight px', - border: "0px" - }; - - page.customHeaders = Reflect.field(config, 'custom_headers'); - - var settings = Reflect.field(config, 'page_settings'); - for (name in Reflect.fields(settings)) { - Reflect.setField(page.settings, name, Reflect.field(settings, name)); - } - } - - /** - * Set up the page event callbacks. - */ - function listenPageEvents() { - page.onAlert = function (message) { - logEvent("alert", {message: message}); - } - - page.onClosing = function (closingPage) { - logEvent("closing"); - } - - page.onConfirm = function (message) { - logEvent("confirm", {message: message}); - return false; - } - - page.onConsoleMessage = function (message, lineNum, sourceId) { - logEvent( - "console_message", - { - message: message, - line_num: lineNum, - source_id: sourceId, - } - ); - } - - page.onError = function (message, trace) { - logEvent("error", {message: message, trace: trace}); - } - - page.onFilePicker = function (oldFile) { - logEvent("file_picker", {old_file: oldFile}); - return null; - } - - page.onInitialized = function () { - logEvent("initialized"); - } - - page.onLoadFinished = function (status) { - logEvent("load_finished", {status: status}); - activityCounter += 1; - } - - page.onLoadStarted = function () { - logEvent("load_started"); - activityCounter += 1; - } - - page.onNavigationRequested = function (url, type, willNavigate, main) { - logEvent("navigation_requested", { - 'url': url, - 'type': type, - 'will_navigate': willNavigate, - 'main': main - }); - } - - page.onPageCreated = function (newPage) { - logEvent("page_created", {}); - } - - page.onPrompt = function (message, defaultValue) { - logEvent("prompt", { - message: message, - default_value: defaultValue, - }); - return null; - } - - page.onResourceError = function (resourceError) { - logEvent("resource_error", { - resource_error: resourceError, - }); - activityCounter += 1; - if (pageLoaded) { - pendingResourcesAfterLoad -= 1; - } - } - - page.onResourceReceived = function (response) { - logEvent("resource_received", { - response: response - }); - activityCounter += 1; - if (pageLoaded && response.stage == "end") { - pendingResourcesAfterLoad -= 1; - } - } - - page.onResourceRequested = function (requestData, networkRequest) { - logEvent("resource_requested", { - request_data: requestData, - network_request: networkRequest - }); - activityCounter += 1; - if (pageLoaded) { - pendingResourcesAfterLoad += 1; - } - } - - page.onResourceTimeout = function (request) { - logEvent("resource_timeout", { - request: request - }); - activityCounter += 1; - if (pageLoaded) { - pendingResourcesAfterLoad -= 1; - } - } - - page.onUrlChanged = function (targetUrl) { - logEvent("url_changed", {target_url: targetUrl}); - } - } - - /** - * Write a page event to the log. - */ - function logEvent(eventName:String, ?eventData:Dynamic) { - if (eventLogFile == null) { - return; - } - - var line = Json.stringify({ - timestamp: Date.now().getTime() / 1000.0, - event: eventName, - value: eventData - }); - eventLogFile.write(line); - eventLogFile.write('\n'); - } - - /** - * Write a page manipulation action to the log. - */ - function logAction(eventName:String, ?eventData:Dynamic) { - if (actionLogFile == null) { - return; - } - - var line = Json.stringify({ - timestamp: Date.now().getTime() / 1000.0, - event: eventName, - value: eventData - }); - actionLogFile.write(line); - actionLogFile.write('\n'); - } - - /** - * Load the URL. - */ - function loadUrl() { - var url:String = Reflect.field(config, "url"); - - trace('Load URL $url.'); - page.open(url, function (status) { - trace('Page loaded! $status.'); - pageLoaded = true; - }); - // For PhantomJS, we need to poll so that the callback isn't in - // the page scope but in here scope. If we don't do this, - // errors don't bubble up and weird security access errors occurs. - pollPageLoad(); - } - - /** - * Pool for page loaded. - */ - function pollPageLoad() { - trace("Polling for load."); - - if (pageLoaded) { - loadFinishedCallback(); - } else { - Browser.window.setTimeout(pollPageLoad, 100); - } - } - - /** - * Callback when page has loaded. - */ - function loadFinishedCallback() { - trace("Load finished."); - - if (isPageDynamic()) { - scrollPage(); - } else { - loadFinishedCallback2(); - } - } - - /** - * Callback when page was scrolled. - */ - function loadFinishedCallback2() { - if (Reflect.field(config, 'snapshot')) { - makeSnapshots(); - } - - close(); - } - - /** - * Return whether the page uses JavaScript. - */ - function isPageDynamic():Bool { - var result:Bool = page.evaluate(" - function () { - return document.getElementsByTagName('script').length || - document.querySelector( - '[onload],[onunload],[onabortonclick],[ondblclick],' + - '[onmousedown],[onmousemove],[onmouseout],[onmouseover],' + - '[onmouseup],[onkeydown],[onkeypress],[onkeyup]'); - } - "); - return result; - } - - function getPageContentHeight():Int { - return page.evaluate("function() { return document.body.scrollHeight; }"); - } - - /** - * Scroll the page to the bottom and then back to top. - */ - function scrollPage() { - var currentY = 0; - var scrollDelay:Int = cast(Reflect.field(config, 'wait_time') * 1000, Int); - var numScrolls:Int = Reflect.field(config, 'num_scrolls'); - var smartScroll:Bool = Reflect.field(config, 'smart_scroll'); - var startDate:Date = null; - - // Try to get rid of any stupid "sign up now" overlays. - var clickX:Int = page.viewportSize.width; - var clickY:Int = page.viewportSize.height; - logAction('click', [clickX, clickY]); - sendClick(clickX, clickY); - - function pollForPendingLoad() { - if (startDate == null) { - startDate = Date.now(); - } - - var duration = Date.now().getTime() - startDate.getTime(); - - trace('pendingResourcesAfterLoad=$pendingResourcesAfterLoad'); - - if (pendingResourcesAfterLoad > 0 && duration < 60000) { - Browser.window.setTimeout(pollForPendingLoad, 100); - } else { - loadFinishedCallback2(); - } - } - - function cleanupScroll() { - logAction("set_scroll_left", 0); - logAction("set_scroll_top", 0); - - setPagePosition(0, 0); - sendKey(page.event.key.Home); - - pollForPendingLoad(); - } - - function actualScroll() { - var beforeActivityCount = activityCounter; - currentY += 768; - - trace('Scroll page $currentY. numScrolls=$numScrolls.'); - logAction("set_scroll_left", 0); - logAction("set_scroll_top", currentY); - - setPagePosition(0, currentY); - sendKey(page.event.key.PageDown); - - Browser.window.setTimeout(function () { - var pageHeight = getPageContentHeight(); - - if (pageHeight == null) { - pageHeight = 0; - } - - trace('before=$beforeActivityCount activityCounter=$activityCounter'); - trace('currentY=$currentY pageHeight=$pageHeight'); - - if (smartScroll && beforeActivityCount == activityCounter && currentY >= pageHeight) { - cleanupScroll(); - return; - } - - numScrolls -= 1; - - if (numScrolls > 0) { - actualScroll(); - } else { - cleanupScroll(); - } - }, scrollDelay); - }; - - actualScroll(); - } - - /** - * Scroll the page to a position. - */ - function setPagePosition(x:Int, y:Int) { - page.scrollPosition = {left: x, top: y}; - page.evaluate(' - function () { - if (window) { - window.scrollTo($x, $y); - } - } - '); - } - - /** - * Send a mouse click to the page. - */ - function sendClick(x:Int, y:Int, button:String = "left") { - page.sendEvent("mousedown", x, y, button); - page.sendEvent("mouseup", x, y, button); - page.sendEvent("click", x, y, button); - } - - /** - * Send a keyboard event to the page. - */ - function sendKey(key:Int, modifier:Int = 0) { - page.sendEvent("keypress", key, null, null, modifier); - page.sendEvent("keydown", key, null, null, modifier); - page.sendEvent("keyup", key, null, null, modifier); - } - - /* - * Render the snapshot files. - */ - function makeSnapshots() { - var paths:Array = Reflect.field(config, "snapshot_paths"); - - for (path in paths) { - trace('Making snapshot $path'); - renderPage(path); - } - } - - /* - * Render page and save to given path. - */ - function renderPage(path:String) { - if (path.endsWith(".html")) { - var file = fs.open(path, "w"); - file.write(page.content); - file.close(); - } else { - page.render(path); - } - } - - /* - * Clean up and exit. - */ - function close() { - trace("Closing."); - page.close(); - - if (actionLogFile != null) { - actionLogFile.flush(); - // XXX: Segfault on at least 1.9.8 - // actionLogFile.close(); - } - - if (eventLogFile != null) { - eventLogFile.flush(); - // XXX: Segfault on at least 1.9.8 - // eventLogFile.close(); - } - - phantom.exit(); - } -} diff --git a/wpull/driver/phantomjs.js b/wpull/driver/phantomjs.js deleted file mode 100644 index 1355edc8..00000000 --- a/wpull/driver/phantomjs.js +++ /dev/null @@ -1,455 +0,0 @@ -(function () { "use strict"; -var HxOverrides = function() { }; -HxOverrides.__name__ = true; -HxOverrides.substr = function(s,pos,len) { - if(pos != null && pos != 0 && len != null && len < 0) return ""; - if(len == null) len = s.length; - if(pos < 0) { - pos = s.length + pos; - if(pos < 0) pos = 0; - } else if(len < 0) len = s.length + len - pos; - return s.substr(pos,len); -}; -var PhantomJS = function() { - this.pageLoaded = false; - this.pendingResourcesAfterLoad = 0; - this.activityCounter = 0; - this.system = require("system"); - this.webpage = require("webpage"); - this.phantom = phantom; - this.fs = require("fs"); -}; -PhantomJS.__name__ = true; -PhantomJS.main = function() { - var app = new PhantomJS(); - app.run(); -}; -PhantomJS.prototype = { - logStderrLine: function(message) { - if(this.system.stderr != null) return this.system.stderr.writeLine(message); else return this.system.stdout.writeLine(message); - } - ,run: function() { - this.setUpErrorHandler(); - this.loadConfig(); - this.createPage(); - this.listenPageEvents(); - this.loadUrl(); - } - ,setUpErrorHandler: function() { - var _g = this; - this.phantom.onError = function(message,traceArray) { - _g.logStderrLine(message); - var _g1 = 0; - while(_g1 < traceArray.length) { - var traceLine = traceArray[_g1]; - ++_g1; - var source; - var functionName = ""; - if(traceLine.file != null) source = traceLine.file; else source = traceLine.sourceURL; - if(Reflect.field(traceLine,"function") != null) functionName = Reflect.field(traceLine,"function"); - _g.logStderrLine(" " + source + ":" + Std.string(traceLine.line) + " " + functionName); - } - }; - } - ,loadConfig: function() { - if(this.system.args.length != 2) throw "Missing launch configuration."; - var configContent = this.fs.read(this.system.args[1]); - this.config = JSON.parse(configContent); - this.openLogFiles(); - } - ,openLogFiles: function() { - var eventLogFilename = Reflect.field(this.config,"event_log_filename"); - var actionLogFilename = Reflect.field(this.config,"action_log_filename"); - if(eventLogFilename != null) this.eventLogFile = this.fs.open(eventLogFilename,"w"); - if(actionLogFilename != null) this.actionLogFile = this.fs.open(actionLogFilename,"w"); - } - ,createPage: function() { - this.page = this.webpage.create(); - this.page.evaluate("function () { document.body.bgColor = 'white'; }"); - this.page.viewportSize = { width : Reflect.field(this.config,"viewport_width"), height : Reflect.field(this.config,"viewport_height")}; - var paperWidth = Reflect.field(this.config,"paper_width"); - var paperHeight = Reflect.field(this.config,"paper_height"); - this.page.paperSize = { width : "" + paperWidth + " px", height : "" + paperHeight + " px", border : "0px"}; - this.page.customHeaders = Reflect.field(this.config,"custom_headers"); - var settings = Reflect.field(this.config,"page_settings"); - var _g = 0; - var _g1 = Reflect.fields(settings); - while(_g < _g1.length) { - var name = _g1[_g]; - ++_g; - Reflect.setField(this.page.settings,name,Reflect.field(settings,name)); - } - } - ,listenPageEvents: function() { - var _g = this; - this.page.onAlert = function(message) { - _g.logEvent("alert",{ message : message}); - }; - this.page.onClosing = function(closingPage) { - _g.logEvent("closing"); - }; - this.page.onConfirm = function(message1) { - _g.logEvent("confirm",{ message : message1}); - return false; - }; - this.page.onConsoleMessage = function(message2,lineNum,sourceId) { - _g.logEvent("console_message",{ message : message2, line_num : lineNum, source_id : sourceId}); - }; - this.page.onError = function(message3,trace) { - _g.logEvent("error",{ message : message3, trace : trace}); - }; - this.page.onFilePicker = function(oldFile) { - _g.logEvent("file_picker",{ old_file : oldFile}); - return null; - }; - this.page.onInitialized = function() { - _g.logEvent("initialized"); - }; - this.page.onLoadFinished = function(status) { - _g.logEvent("load_finished",{ status : status}); - _g.activityCounter += 1; - }; - this.page.onLoadStarted = function() { - _g.logEvent("load_started"); - _g.activityCounter += 1; - }; - this.page.onNavigationRequested = function(url,type,willNavigate,main) { - _g.logEvent("navigation_requested",{ url : url, type : type, will_navigate : willNavigate, main : main}); - }; - this.page.onPageCreated = function(newPage) { - _g.logEvent("page_created",{ }); - }; - this.page.onPrompt = function(message4,defaultValue) { - _g.logEvent("prompt",{ message : message4, default_value : defaultValue}); - return null; - }; - this.page.onResourceError = function(resourceError) { - _g.logEvent("resource_error",{ resource_error : resourceError}); - _g.activityCounter += 1; - if(_g.pageLoaded) _g.pendingResourcesAfterLoad -= 1; - }; - this.page.onResourceReceived = function(response) { - _g.logEvent("resource_received",{ response : response}); - _g.activityCounter += 1; - if(_g.pageLoaded && response.stage == "end") _g.pendingResourcesAfterLoad -= 1; - }; - this.page.onResourceRequested = function(requestData,networkRequest) { - _g.logEvent("resource_requested",{ request_data : requestData, network_request : networkRequest}); - _g.activityCounter += 1; - if(_g.pageLoaded) _g.pendingResourcesAfterLoad += 1; - }; - this.page.onResourceTimeout = function(request) { - _g.logEvent("resource_timeout",{ request : request}); - _g.activityCounter += 1; - if(_g.pageLoaded) _g.pendingResourcesAfterLoad -= 1; - }; - this.page.onUrlChanged = function(targetUrl) { - _g.logEvent("url_changed",{ target_url : targetUrl}); - }; - } - ,logEvent: function(eventName,eventData) { - if(this.eventLogFile == null) return; - var line = JSON.stringify({ timestamp : new Date().getTime() / 1000.0, event : eventName, value : eventData}); - this.eventLogFile.write(line); - this.eventLogFile.write("\n"); - } - ,logAction: function(eventName,eventData) { - if(this.actionLogFile == null) return; - var line = JSON.stringify({ timestamp : new Date().getTime() / 1000.0, event : eventName, value : eventData}); - this.actionLogFile.write(line); - this.actionLogFile.write("\n"); - } - ,loadUrl: function() { - var _g = this; - var url = Reflect.field(this.config,"url"); - console.log("Load URL " + url + "."); - this.page.open(url,function(status) { - console.log("Page loaded! " + status + "."); - _g.pageLoaded = true; - }); - this.pollPageLoad(); - } - ,pollPageLoad: function() { - console.log("Polling for load."); - if(this.pageLoaded) this.loadFinishedCallback(); else window.setTimeout($bind(this,this.pollPageLoad),100); - } - ,loadFinishedCallback: function() { - console.log("Load finished."); - if(this.isPageDynamic()) this.scrollPage(); else this.loadFinishedCallback2(); - } - ,loadFinishedCallback2: function() { - if(Reflect.field(this.config,"snapshot")) this.makeSnapshots(); - this.close(); - } - ,isPageDynamic: function() { - var result = this.page.evaluate("\n function () {\n return document.getElementsByTagName('script').length ||\n document.querySelector(\n '[onload],[onunload],[onabortonclick],[ondblclick],' +\n '[onmousedown],[onmousemove],[onmouseout],[onmouseover],' +\n '[onmouseup],[onkeydown],[onkeypress],[onkeyup]');\n }\n "); - return result; - } - ,getPageContentHeight: function() { - return this.page.evaluate("function() { return document.body.scrollHeight; }"); - } - ,scrollPage: function() { - var _g = this; - var currentY = 0; - var scrollDelay; - scrollDelay = js.Boot.__cast(Reflect.field(this.config,"wait_time") * 1000 , Int); - var numScrolls = Reflect.field(this.config,"num_scrolls"); - var smartScroll = Reflect.field(this.config,"smart_scroll"); - var startDate = null; - var clickX = this.page.viewportSize.width; - var clickY = this.page.viewportSize.height; - this.logAction("click",[clickX,clickY]); - this.sendClick(clickX,clickY); - var pollForPendingLoad; - var pollForPendingLoad1 = null; - pollForPendingLoad1 = function() { - if(startDate == null) startDate = new Date(); - var duration = new Date().getTime() - startDate.getTime(); - console.log("pendingResourcesAfterLoad=" + _g.pendingResourcesAfterLoad); - if(_g.pendingResourcesAfterLoad > 0 && duration < 60000) window.setTimeout(pollForPendingLoad1,100); else _g.loadFinishedCallback2(); - }; - pollForPendingLoad = pollForPendingLoad1; - var cleanupScroll = function() { - _g.logAction("set_scroll_left",0); - _g.logAction("set_scroll_top",0); - _g.setPagePosition(0,0); - _g.sendKey(_g.page.event.key.Home); - pollForPendingLoad(); - }; - var actualScroll; - var actualScroll1 = null; - actualScroll1 = function() { - var beforeActivityCount = _g.activityCounter; - currentY += 768; - console.log("Scroll page " + currentY + ". numScrolls=" + numScrolls + "."); - _g.logAction("set_scroll_left",0); - _g.logAction("set_scroll_top",currentY); - _g.setPagePosition(0,currentY); - _g.sendKey(_g.page.event.key.PageDown); - window.setTimeout(function() { - var pageHeight = _g.getPageContentHeight(); - if(pageHeight == null) pageHeight = 0; - console.log("before=" + beforeActivityCount + " activityCounter=" + _g.activityCounter); - console.log("currentY=" + currentY + " pageHeight=" + pageHeight); - if(smartScroll && beforeActivityCount == _g.activityCounter && currentY >= pageHeight) { - cleanupScroll(); - return; - } - numScrolls -= 1; - if(numScrolls > 0) actualScroll1(); else cleanupScroll(); - },scrollDelay); - }; - actualScroll = actualScroll1; - actualScroll(); - } - ,setPagePosition: function(x,y) { - this.page.scrollPosition = { left : x, top : y}; - this.page.evaluate("\n function () {\n if (window) {\n window.scrollTo(" + x + ", " + y + ");\n }\n }\n "); - } - ,sendClick: function(x,y,button) { - if(button == null) button = "left"; - this.page.sendEvent("mousedown",x,y,button); - this.page.sendEvent("mouseup",x,y,button); - this.page.sendEvent("click",x,y,button); - } - ,sendKey: function(key,modifier) { - if(modifier == null) modifier = 0; - this.page.sendEvent("keypress",key,null,null,modifier); - this.page.sendEvent("keydown",key,null,null,modifier); - this.page.sendEvent("keyup",key,null,null,modifier); - } - ,makeSnapshots: function() { - var paths = Reflect.field(this.config,"snapshot_paths"); - var _g = 0; - while(_g < paths.length) { - var path = paths[_g]; - ++_g; - console.log("Making snapshot " + path); - this.renderPage(path); - } - } - ,renderPage: function(path) { - if(StringTools.endsWith(path,".html")) { - var file = this.fs.open(path,"w"); - file.write(this.page.content); - file.close(); - } else this.page.render(path); - } - ,close: function() { - console.log("Closing."); - this.page.close(); - if(this.actionLogFile != null) this.actionLogFile.flush(); - if(this.eventLogFile != null) this.eventLogFile.flush(); - this.phantom.exit(); - } - ,__class__: PhantomJS -}; -var Reflect = function() { }; -Reflect.__name__ = true; -Reflect.field = function(o,field) { - try { - return o[field]; - } catch( e ) { - return null; - } -}; -Reflect.setField = function(o,field,value) { - o[field] = value; -}; -Reflect.fields = function(o) { - var a = []; - if(o != null) { - var hasOwnProperty = Object.prototype.hasOwnProperty; - for( var f in o ) { - if(f != "__id__" && f != "hx__closures__" && hasOwnProperty.call(o,f)) a.push(f); - } - } - return a; -}; -var Std = function() { }; -Std.__name__ = true; -Std.string = function(s) { - return js.Boot.__string_rec(s,""); -}; -var StringTools = function() { }; -StringTools.__name__ = true; -StringTools.endsWith = function(s,end) { - var elen = end.length; - var slen = s.length; - return slen >= elen && HxOverrides.substr(s,slen - elen,elen) == end; -}; -var js = {}; -js.Boot = function() { }; -js.Boot.__name__ = true; -js.Boot.getClass = function(o) { - if((o instanceof Array) && o.__enum__ == null) return Array; else return o.__class__; -}; -js.Boot.__string_rec = function(o,s) { - if(o == null) return "null"; - if(s.length >= 5) return "<...>"; - var t = typeof(o); - if(t == "function" && (o.__name__ || o.__ename__)) t = "object"; - switch(t) { - case "object": - if(o instanceof Array) { - if(o.__enum__) { - if(o.length == 2) return o[0]; - var str = o[0] + "("; - s += "\t"; - var _g1 = 2; - var _g = o.length; - while(_g1 < _g) { - var i = _g1++; - if(i != 2) str += "," + js.Boot.__string_rec(o[i],s); else str += js.Boot.__string_rec(o[i],s); - } - return str + ")"; - } - var l = o.length; - var i1; - var str1 = "["; - s += "\t"; - var _g2 = 0; - while(_g2 < l) { - var i2 = _g2++; - str1 += (i2 > 0?",":"") + js.Boot.__string_rec(o[i2],s); - } - str1 += "]"; - return str1; - } - var tostr; - try { - tostr = o.toString; - } catch( e ) { - return "???"; - } - if(tostr != null && tostr != Object.toString) { - var s2 = o.toString(); - if(s2 != "[object Object]") return s2; - } - var k = null; - var str2 = "{\n"; - s += "\t"; - var hasp = o.hasOwnProperty != null; - for( var k in o ) { - if(hasp && !o.hasOwnProperty(k)) { - continue; - } - if(k == "prototype" || k == "__class__" || k == "__super__" || k == "__interfaces__" || k == "__properties__") { - continue; - } - if(str2.length != 2) str2 += ", \n"; - str2 += s + k + " : " + js.Boot.__string_rec(o[k],s); - } - s = s.substring(1); - str2 += "\n" + s + "}"; - return str2; - case "function": - return ""; - case "string": - return o; - default: - return String(o); - } -}; -js.Boot.__interfLoop = function(cc,cl) { - if(cc == null) return false; - if(cc == cl) return true; - var intf = cc.__interfaces__; - if(intf != null) { - var _g1 = 0; - var _g = intf.length; - while(_g1 < _g) { - var i = _g1++; - var i1 = intf[i]; - if(i1 == cl || js.Boot.__interfLoop(i1,cl)) return true; - } - } - return js.Boot.__interfLoop(cc.__super__,cl); -}; -js.Boot.__instanceof = function(o,cl) { - if(cl == null) return false; - switch(cl) { - case Int: - return (o|0) === o; - case Float: - return typeof(o) == "number"; - case Bool: - return typeof(o) == "boolean"; - case String: - return typeof(o) == "string"; - case Array: - return (o instanceof Array) && o.__enum__ == null; - case Dynamic: - return true; - default: - if(o != null) { - if(typeof(cl) == "function") { - if(o instanceof cl) return true; - if(js.Boot.__interfLoop(js.Boot.getClass(o),cl)) return true; - } - } else return false; - if(cl == Class && o.__name__ != null) return true; - if(cl == Enum && o.__ename__ != null) return true; - return o.__enum__ == cl; - } -}; -js.Boot.__cast = function(o,t) { - if(js.Boot.__instanceof(o,t)) return o; else throw "Cannot cast " + Std.string(o) + " to " + Std.string(t); -}; -var $_, $fid = 0; -function $bind(o,m) { if( m == null ) return null; if( m.__id__ == null ) m.__id__ = $fid++; var f; if( o.hx__closures__ == null ) o.hx__closures__ = {}; else f = o.hx__closures__[m.__id__]; if( f == null ) { f = function(){ return f.method.apply(f.scope, arguments); }; f.scope = o; f.method = m; o.hx__closures__[m.__id__] = f; } return f; } -String.prototype.__class__ = String; -String.__name__ = true; -Array.__name__ = true; -Date.prototype.__class__ = Date; -Date.__name__ = ["Date"]; -var Int = { __name__ : ["Int"]}; -var Dynamic = { __name__ : ["Dynamic"]}; -var Float = Number; -Float.__name__ = ["Float"]; -var Bool = Boolean; -Bool.__ename__ = ["Bool"]; -var Class = { __name__ : ["Class"]}; -var Enum = { }; -PhantomJS.main(); -})(); diff --git a/wpull/driver/phantomjs.py b/wpull/driver/phantomjs.py deleted file mode 100644 index 6052e64f..00000000 --- a/wpull/driver/phantomjs.py +++ /dev/null @@ -1,141 +0,0 @@ -import json -import logging -import os.path -import subprocess -import tempfile - -import namedlist -import asyncio - -from wpull.driver.process import Process -import wpull.util - - -_logger = logging.getLogger(__name__) - - -PhantomJSDriverParams = namedlist.namedtuple( - 'PhantomJSDriverParamsType', [ - 'url', - ('snapshot_paths', []), - ('wait_time', 1), - ('num_scrolls', 10), - ('smart_scroll', True), - ('snapshot', True), - ('viewport_size', (1200, 1920)), - ('paper_size', (2400, 3840)), - ('event_log_filename', None), - ('action_log_filename', None), - ('custom_headers', {}), - ('page_settings', {}), - ] -) -'''PhantomJS Driver parameters - -Attributes: - url (str): URL of page to fetch. - snapshot_type (list): List of filenames. Accepted extensions are html, - pdf, png, gif. - wait_time (float): Time between page scrolls. - num_scrolls (int): Maximum number of scrolls. - smart_scroll (bool): Whether to stop scrolling if number of - requests & responses do not change. - snapshot (bool): Whether to take snapshot files. - viewport_size (tuple): Width and height of the page viewport. - paper_size (tuple): Width and height of the paper size. - event_log_filename (str): Path to save page events. - action_log_filename (str): Path to save page action manipulation events. - custom_headers (dict): Custom HTTP request headers. - page_settings (dict): Page settings. -''' - - -class PhantomJSDriver(Process): - '''PhantomJS processing. - - Args: - exe_path (str): Path of the PhantomJS executable. - extra_args (list): Additional arguments for PhantomJS. Most likely, - you'll want to pass proxy settings for capturing traffic. - params (:class:`PhantomJSDriverParams`): Parameters for controlling - the processing pipeline. - - This class launches PhantomJS that scrolls and saves snapshots. It can - only be used once per URL. - ''' - def __init__(self, exe_path='phantomjs', extra_args=None, params=None): - script_path = wpull.util.get_package_filename('driver/phantomjs.js') - - self._config_file = tempfile.NamedTemporaryFile( - prefix='tmp-wpull-', suffix='.json', delete=False - ) - - args = [exe_path] + (extra_args or []) + [script_path, self._config_file.name] - super().__init__(args, stderr_callback=self._stderr_callback) - - self._params = params - - @asyncio.coroutine - def _stderr_callback(self, line): - _logger.warning(line.decode('utf-8', 'replace').rstrip()) - - @asyncio.coroutine - def start(self, use_atexit=True): - _logger.debug('PhantomJS start.') - - self._write_config() - - yield from super().start(use_atexit) - - def _write_config(self): - '''Write the parameters to a file for PhantomJS to read.''' - param_dict = { - 'url': self._params.url, - 'snapshot_paths': self._params.snapshot_paths, - 'wait_time': self._params.wait_time, - 'num_scrolls': self._params.num_scrolls, - 'smart_scroll': self._params.smart_scroll, - 'snapshot': self._params.snapshot, - 'viewport_width': self._params.viewport_size[0], - 'viewport_height': self._params.viewport_size[1], - 'paper_width': self._params.paper_size[0], - 'paper_height': self._params.paper_size[1], - 'custom_headers': self._params.custom_headers, - 'page_settings': self._params.page_settings, - } - - if self._params.event_log_filename: - param_dict['event_log_filename'] = \ - os.path.abspath(self._params.event_log_filename) - - if self._params.action_log_filename: - param_dict['action_log_filename'] = \ - os.path.abspath(self._params.action_log_filename) - - config_text = json.dumps(param_dict) - - self._config_file.write(config_text.encode('utf-8')) - - # Close it so the phantomjs process can read it on Windows - self._config_file.close() - - def close(self): - _logger.debug('Terminate phantomjs process.') - super().close() - - if os.path.exists(self._config_file.name): - os.remove(self._config_file.name) - - -def get_version(exe_path='phantomjs'): - '''Get the version string of PhantomJS.''' - process = subprocess.Popen( - [exe_path, '--version'], - stdout=subprocess.PIPE - ) - version_string = process.communicate()[0] - version_string = version_string.decode().strip() - - assert ' ' not in version_string, version_string - - return version_string diff --git a/wpull/driver/phantomjs_test.py b/wpull/driver/phantomjs_test.py deleted file mode 100644 index c812f092..00000000 --- a/wpull/driver/phantomjs_test.py +++ /dev/null @@ -1,55 +0,0 @@ -import contextlib -import os - -from wpull.driver.phantomjs import PhantomJSDriver, PhantomJSDriverParams -from wpull.testing.goodapp import GoodAppTestCase -import wpull.testing.async -from wpull.testing.util import TempDirMixin - - -DEFAULT_TIMEOUT = 30 - - -class TestPhantomJS(GoodAppTestCase, TempDirMixin): - def setUp(self): - super().setUp() - self.set_up_temp_dir() - - def tearDown(self): - super().tearDown() - self.tear_down_temp_dir() - - @wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT) - def test_driver(self): - params = PhantomJSDriverParams( - self.get_url('/static/DEUUEAUGH.html'), - snapshot_paths=['test.png', 'test.pdf', 'test.html'], - event_log_filename='event.log', - action_log_filename='action.log', - wait_time=0.2, - custom_headers={ - 'X-Doge': 'Wow' - }, - page_settings={ - 'resourceTimeout': 1000 - } - ) - - driver = PhantomJSDriver(params=params) - - yield from driver.start() - yield from driver.process.wait() - - self.assertEqual(0, driver.process.returncode) - - self.assertTrue(os.path.isfile('test.png')) - self.assertGreater(os.path.getsize('test.png'), 100) - self.assertTrue(os.path.isfile('test.pdf')) - self.assertGreater(os.path.getsize('test.pdf'), 100) - self.assertTrue(os.path.isfile('test.html')) - self.assertGreater(os.path.getsize('test.html'), 100) - - self.assertTrue(os.path.isfile('action.log')) - self.assertGreater(os.path.getsize('action.log'), 100) - self.assertTrue(os.path.isfile('event.log')) - self.assertGreater(os.path.getsize('event.log'), 100) diff --git a/wpull/processor/coprocessor/phantomjs.py b/wpull/processor/coprocessor/phantomjs.py deleted file mode 100644 index fb1af12e..00000000 --- a/wpull/processor/coprocessor/phantomjs.py +++ /dev/null @@ -1,329 +0,0 @@ -'''PhantomJS page loading and scrolling.''' -import contextlib -import copy -import gettext -import json -import logging -import os -import tempfile -import io - -import namedlist -import asyncio - -from typing import Callable - -from wpull.backport.logging import BraceMessage as __ -from wpull.document.html import HTMLReader -from wpull.body import Body -from wpull.driver.phantomjs import PhantomJSDriverParams, PhantomJSDriver -from wpull.namevalue import NameValueRecord -from wpull.pipeline.session import ItemSession -from wpull.processor.rule import ProcessingRule -from wpull.warc.format import WARCRecord -import wpull.url - - -PhantomJSParams = namedlist.namedtuple( - 'PhantomJSParamsType', [ - ('snapshot_types', ('html', 'pdf')), - ('wait_time', 1), - ('num_scrolls', 10), - ('smart_scroll', True), - ('snapshot', True), - ('viewport_size', (1200, 1920)), - ('paper_size', (2400, 3840)), - ('load_time', 900), - ('custom_headers', {}), - ('page_settings', {}), - ] -) -'''PhantomJS parameters - -Attributes: - snapshot_type (list): File types. Accepted are html, pdf, png, gif. - wait_time (float): Time between page scrolls. - num_scrolls (int): Maximum number of scrolls. - smart_scroll (bool): Whether to stop scrolling if number of - requests & responses do not change. - snapshot (bool): Whether to take snapshot files. - viewport_size (tuple): Width and height of the page viewport. - paper_size (tuple): Width and height of the paper size. - load_time (float): Maximum time to wait for page load. - custom_headers (dict): Default HTTP headers. - page_settings (dict): Page settings. -''' - - -_logger = logging.getLogger(__name__) -_ = gettext.gettext - - -class PhantomJSCrashed(Exception): - '''PhantomJS exited with non-zero code.''' - - -class PhantomJSCoprocessor(object): - '''PhantomJS coprocessor. - - Args: - phantomjs_driver_factory: Callback function that accepts ``params`` - argument and returns PhantomJSDriver - processing_rule: Processing - rule. - warc_recorder: WARC recorder. - root_dir (str): Root directory path for temp files. - ''' - def __init__(self, phantomjs_driver_factory: Callable[..., PhantomJSDriver], - processing_rule: ProcessingRule, - phantomjs_params: PhantomJSParams, - warc_recorder=None, root_path='.'): - self._phantomjs_driver_factory = phantomjs_driver_factory - self._processing_rule = processing_rule - self._phantomjs_params = phantomjs_params - self._warc_recorder = warc_recorder - self._root_path = root_path - - self._file_writer_session = None - - @asyncio.coroutine - def process(self, item_session: ItemSession, request, response, file_writer_session): - '''Process PhantomJS. - - Coroutine. - ''' - if response.status_code != 200: - return - - if not HTMLReader.is_supported(request=request, response=response): - return - - _logger.debug('Starting PhantomJS processing.') - - self._file_writer_session = file_writer_session - - # FIXME: this is a quick hack for crashes. See #137. - attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5)) - - for dummy in range(attempts): - try: - yield from self._run_driver(item_session, request, response) - except asyncio.TimeoutError: - _logger.warning(_('Waiting for page load timed out.')) - break - except PhantomJSCrashed as error: - _logger.exception(__('PhantomJS crashed: {}', error)) - else: - break - else: - _logger.warning(__( - _('PhantomJS failed to fetch ‘{url}’. I am sorry.'), - url=request.url_info.url - )) - - @asyncio.coroutine - def _run_driver(self, item_session: ItemSession, request, response): - '''Start PhantomJS processing.''' - _logger.debug('Started PhantomJS processing.') - - session = PhantomJSCoprocessorSession( - self._phantomjs_driver_factory, self._root_path, - self._processing_rule, self._file_writer_session, - request, response, - item_session, self._phantomjs_params, self._warc_recorder - ) - - with contextlib.closing(session): - yield from session.run() - - _logger.debug('Ended PhantomJS processing.') - - -class PhantomJSCoprocessorSession(object): - '''PhantomJS coprocessor session.''' - def __init__(self, phantomjs_driver_factory, root_path, - processing_rule, file_writer_session, - request, response, - item_session: ItemSession, params, warc_recorder): - self._phantomjs_driver_factory = phantomjs_driver_factory - self._root_path = root_path - self._processing_rule = processing_rule - self._file_writer_session = file_writer_session - self._request = request - self._response = response - self._item_session = item_session - self._params = params - self._warc_recorder = warc_recorder - self._temp_filenames = [] - self._action_warc_record = None - - @asyncio.coroutine - def run(self): - scrape_snapshot_path = self._get_temp_path('phantom', suffix='.html') - action_log_path = self._get_temp_path('phantom-action', suffix='.txt') - event_log_path = self._get_temp_path('phantom-event', suffix='.txt') - snapshot_paths = [scrape_snapshot_path] - snapshot_paths.extend(self._get_snapshot_paths()) - url = self._item_session.url_record.url - - driver_params = PhantomJSDriverParams( - url=url, - snapshot_paths=snapshot_paths, - wait_time=self._params.wait_time, - num_scrolls=self._params.num_scrolls, - smart_scroll=self._params.smart_scroll, - snapshot=self._params.snapshot, - viewport_size=self._params.viewport_size, - paper_size=self._params.paper_size, - event_log_filename=event_log_path, - action_log_filename=action_log_path, - custom_headers=self._params.custom_headers, - page_settings=self._params.page_settings, - ) - - driver = self._phantomjs_driver_factory(params=driver_params) - - _logger.info(__( - _('PhantomJS fetching ‘{url}’.'), - url=url - )) - - with contextlib.closing(driver): - yield from driver.start() - - # FIXME: we don't account that things might be scrolling and - # downloading so it might not be a good idea to timeout like - # this - if self._params.load_time: - yield from asyncio.wait_for( - driver.process.wait(), self._params.load_time - ) - else: - yield from driver.process.wait() - - if driver.process.returncode != 0: - raise PhantomJSCrashed( - 'PhantomJS exited with code {}' - .format(driver.process.returncode) - ) - - if self._warc_recorder: - self._add_warc_action_log(action_log_path, url) - for path in snapshot_paths: - self._add_warc_snapshot(path, url) - - _logger.info(__( - _('PhantomJS fetched ‘{url}’.'), - url=url - )) - - def _get_temp_path(self, hint, suffix='.tmp'): - temp_fd, temp_path = tempfile.mkstemp( - dir=self._root_path, prefix='tmp-wpull-{}'.format(hint), suffix=suffix - ) - os.close(temp_fd) - self._temp_filenames.append(temp_path) - - return temp_path - - def _get_snapshot_paths(self, infix='snapshot'): - for snapshot_type in self._params.snapshot_types or (): - path = self._file_writer_session.extra_resource_path( - '.{infix}.{file_type}' - .format(infix=infix, file_type=snapshot_type) - ) - - if not path: - temp_fd, temp_path = tempfile.mkstemp( - dir=self._root_path, prefix='tmp-phnsh', - suffix='.{}'.format(snapshot_type) - ) - os.close(temp_fd) - path = temp_path - self._temp_filenames.append(temp_path) - - yield path - - def _add_warc_action_log(self, path, url): - '''Add the action log to the WARC file.''' - _logger.debug('Adding action log record.') - - actions = [] - with open(path, 'r', encoding='utf-8', errors='replace') as file: - for line in file: - actions.append(json.loads(line)) - - log_data = json.dumps( - {'actions': actions}, - indent=4, - ).encode('utf-8') - - self._action_warc_record = record = WARCRecord() - record.set_common_fields('metadata', 'application/json') - record.fields['WARC-Target-URI'] = 'urn:X-wpull:snapshot?url={0}' \ - .format(wpull.url.percent_encode_query_value(url)) - record.block_file = io.BytesIO(log_data) - - self._warc_recorder.set_length_and_maybe_checksums(record) - self._warc_recorder.write_record(record) - - def _add_warc_snapshot(self, filename, url): - '''Add the snaphot to the WARC file.''' - _logger.debug('Adding snapshot record.') - - extension = os.path.splitext(filename)[1] - content_type = { - '.pdf': 'application/pdf', - '.html': 'text/html', - '.png': 'image/png', - '.gif': 'image/gif' - }[extension] - - record = WARCRecord() - record.set_common_fields('resource', content_type) - record.fields['WARC-Target-URI'] = 'urn:X-wpull:snapshot?url={0}' \ - .format(wpull.url.percent_encode_query_value(url)) - - if self._action_warc_record: - record.fields['WARC-Concurrent-To'] = \ - self._action_warc_record.fields[WARCRecord.WARC_RECORD_ID] - - with open(filename, 'rb') as in_file: - record.block_file = in_file - - self._warc_recorder.set_length_and_maybe_checksums(record) - self._warc_recorder.write_record(record) - - def _scrape_document(self): - '''Extract links from the DOM.''' - mock_response = self._new_mock_response( - self._response, self._get_temp_path('phantom', '.html') - ) - - self._item_session.request = self._request - self._item_session.response = mock_response - - self._processing_rule.scrape_document(item_session) - - if mock_response.body: - mock_response.body.close() - - def _new_mock_response(self, response, file_path): - '''Return a new mock Response with the content.''' - mock_response = copy.copy(response) - - mock_response.body = Body(open(file_path, 'rb')) - mock_response.fields = NameValueRecord() - - for name, value in response.fields.get_all(): - mock_response.fields.add(name, value) - - mock_response.fields['Content-Type'] = 'text/html; charset="utf-8"' - - return mock_response - - def close(self): - '''Clean up.''' - for path in self._temp_filenames: - if os.path.exists(path): - os.remove(path) diff --git a/wpull/processor/web.py b/wpull/processor/web.py index 3ccc1ea2..8627a8a5 100644 --- a/wpull/processor/web.py +++ b/wpull/processor/web.py @@ -15,7 +15,6 @@ from wpull.application.hook import HookableMixin, Actions from wpull.pipeline.item import URLRecord from wpull.pipeline.session import ItemSession -from wpull.processor.coprocessor.phantomjs import PhantomJSCoprocessor from wpull.processor.coprocessor.youtubedl import YoutubeDlCoprocessor from wpull.protocol.http.request import Request, Response from wpull.protocol.http.web import LoopType, WebClient @@ -449,14 +448,6 @@ def _close_instance_body(self, instance): instance.body.close() def _run_coprocessors(self, request: Request, response: Response): - phantomjs_coprocessor = self._item_session.app_session.factory.get('PhantomJSCoprocessor') - - if phantomjs_coprocessor: - phantomjs_coprocessor = cast(PhantomJSCoprocessor, phantomjs_coprocessor) - yield from phantomjs_coprocessor.process( - self._item_session, request, response, self._file_writer_session - ) - youtube_dl_coprocessor = self._item_session.app_session.factory.get('YoutubeDlCoprocessor') if youtube_dl_coprocessor: diff --git a/wpull/testing/integration/phantomjs_test.py b/wpull/testing/integration/phantomjs_test.py deleted file mode 100644 index 74d155f4..00000000 --- a/wpull/testing/integration/phantomjs_test.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import unittest - -from wpull.application.builder import Builder -from wpull.application.options import AppArgumentParser -from wpull.testing.integration.base import HTTPGoodAppTestCase -import wpull.testing.async -from wpull.testing.integration.http_app_test import MockDNSResolver -from wpull.testing.util import TempDirMixin -from wpull.util import IS_PYPY - - -class PhantomJSMixin(object): - # FIXME: it stopped working in Travis for a while - @unittest.skipIf(os.environ.get('TRAVIS'), 'Broken under Travis CI') - @wpull.testing.async.async_test() - def test_app_phantomjs(self): - arg_parser = AppArgumentParser() - script_filename = os.path.join(os.path.dirname(__file__), - 'sample_user_scripts', 'boring.plugin.py') - - # Change localhost into something else to test proxy - args = arg_parser.parse_args([ - self.get_url('/static/simple_javascript.html').replace('localhost', 'example.invalid'), - '--warc-file', 'test', - '--no-warc-compression', - '-4', - '--no-robots', - '--phantomjs', - '--phantomjs-exe', 'phantomjs', - '--phantomjs-wait', '0.1', - '--phantomjs-scroll', '2', - '--header', 'accept-language: dragon', - '--plugin-script', script_filename, - '--no-check-certificate', - ]) - builder = Builder(args, unit_test=True) - builder.factory.class_map['Resolver'] = MockDNSResolver - - app = builder.build() - exit_code = yield from app.run() - - self.assertTrue(os.path.exists('test.warc')) - self.assertTrue( - os.path.exists('simple_javascript.html.snapshot.html') - ) - self.assertTrue( - os.path.exists('simple_javascript.html.snapshot.pdf') - ) - - with open('simple_javascript.html.snapshot.html', 'rb') as in_file: - data = in_file.read() - self.assertIn(b'Hello world!', data) - - with open('test.warc', 'rb') as in_file: - data = in_file.read() - - self.assertIn(b'urn:X-wpull:snapshot?url=', data) - self.assertIn(b'text/html', data) - self.assertIn(b'application/pdf', data) - self.assertIn(b'application/json', data) - self.assertIn(b'"set_scroll_top"', data) - try: - self.assertIn(b'Accept-Encoding: identity', data) - except AssertionError: - # webkit treats localhost differently - self.assertNotIn(b'Accept-Encoding: gzip', data) - self.assertIn(b'Accept-Language: dragon', data) - - self.assertEqual(0, exit_code) - self.assertGreaterEqual(builder.factory['Statistics'].files, 1) - - @unittest.skipIf(os.environ.get('TRAVIS'), 'Broken under Travis CI') - @wpull.testing.async.async_test( - timeout=30 * 3 if IS_PYPY else 30 - ) - def test_app_phantomjs_scroll(self): - arg_parser = AppArgumentParser() - - # Change localhost into something else to test proxy - args = arg_parser.parse_args([ - self.get_url('/static/DEUUEAUGH.html').replace('localhost', 'example.invalid'), - '-4', - '--no-robots', - '--phantomjs', - '--phantomjs-wait', '0.4', - '--phantomjs-scroll', '20', - '--no-check-certificate', - ]) - builder = Builder(args, unit_test=True) - builder.factory.class_map['Resolver'] = MockDNSResolver - - app = builder.build() - exit_code = yield from app.run() - - with open('DEUUEAUGH.html.snapshot.html', 'rb') as in_file: - data = in_file.read() - self.assertIn(b'Count: 10', data) - - self.assertEqual(0, exit_code) - - -class TestPhantomJS(HTTPGoodAppTestCase, PhantomJSMixin): - pass - - -class TestPhantomJSHTTPS(HTTPGoodAppTestCase, PhantomJSMixin, TempDirMixin): - pass