To verify which resources were received, we rely on the onResourceReceived event of a webpage in PhantomJS. This callback is invoked when a resource requested by the page is received. The only argument to the callback is the response metadata object. If the resource is large and sent by the server in multiple chunks, onResourceReceived will be invoked for every chunk received by PhantomJS. Through this event you can list all the web resources that the page loads, such as images, fonts, style sheets and scripts.
PhantomJS (Chromium, as PhantomJS is a headless WebKit) uses a multi-process resource loading method. All network communication is handled by the main browser process. This is done not only so that the browser process can control each renderer's access to the network, but also so that it can maintain consistent session state, such as cookies and cached data, across processes. It is also important because, as an HTTP/1.1 user agent, the browser as a whole should not open too many connections per host.
Check which resources were received
// Minimal PhantomJS script: open a page and print the URL carried by every
// resource response notification the page receives.
var webpage = require('webpage');
var websiteToCheck = "https://github.com";
var page = webpage.create();

/**
 * Prints the URL of a received resource.
 *
 * @param {Object} response - response metadata object passed to the
 *     onResourceReceived callback
 */
function logResourceUrl(response) {
    console.log(response.url);
}

/**
 * Callback for page.open: calls phantom.exit() as soon as it is invoked.
 */
function quit() {
    phantom.exit();
}

page.open(websiteToCheck, quit);
page.onResourceReceived = logResourceUrl;
To check how this works, save the previous code in a script (index.js) and execute it with PhantomJS by running: phantomjs index.js
The code will generate output similar to the following:
https://github.com/
https://github.com/
https://assets-cdn.github.com/assets/site-052f19062b5cc9c804bcfe6c835ee11a90f898e7524d1609f639301a5eb7cd1d.css
https://assets-cdn.github.com/assets/frameworks-a44e0bdd1666101af23963e4027cd7a0a1eea1339e0e7422524f2e7f3900e86b.css
https://assets-cdn.github.com/assets/github-ac9c637b29122a4699fcd4d205b2d09efa4d4962d369158f7d907123061143f1.css
https://assets-cdn.github.com/images/modules/site/inform-globe-transparent.svg
https://assets-cdn.github.com/images/modules/site/inform-globe-transparent.svg
https://assets-cdn.github.com/images/modules/site/home-ill-build.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-build.png?sn
https://assets-cdn.github.com/assets/site-052f19062b5cc9c804bcfe6c835ee11a90f898e7524d1609f639301a5eb7cd1d.css
https://assets-cdn.github.com/images/modules/site/home-ill-work.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-work.png?sn
https://assets-cdn.github.com/assets/frameworks-a44e0bdd1666101af23963e4027cd7a0a1eea1339e0e7422524f2e7f3900e86b.css
https://assets-cdn.github.com/images/modules/site/home-ill-projects.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-platform.png?sn
https://assets-cdn.github.com/images/modules/site/org_example_nasa.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-projects.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-platform.png?sn
https://assets-cdn.github.com/assets/compat-8e19569aacd39e737a14c8515582825f3c90d1794c0e5539f9b525b8eb8b5a8e.js
https://assets-cdn.github.com/assets/compat-8e19569aacd39e737a14c8515582825f3c90d1794c0e5539f9b525b8eb8b5a8e.js
https://assets-cdn.github.com/assets/frameworks-a631ecd079e91d27e8c4826bced857c2e359148f6e4139c2485ee4eaf6e8b493.js
https://assets-cdn.github.com/assets/github-e34181e8d9bc6f988dd7ed883775106306f940b87ad55ff9dee30c7014b3d596.js
https://assets-cdn.github.com/assets/github-ac9c637b29122a4699fcd4d205b2d09efa4d4962d369158f7d907123061143f1.css
https://assets-cdn.github.com/images/modules/site/org_example_nasa.png?sn
https://assets-cdn.github.com/assets/frameworks-a631ecd079e91d27e8c4826bced857c2e359148f6e4139c2485ee4eaf6e8b493.js
https://assets-cdn.github.com/images/modules/site/home-hero-sm.jpg?sn
https://assets-cdn.github.com/images/modules/site/home-hero-sm.jpg?sn
https://assets-cdn.github.com/assets/github-e34181e8d9bc6f988dd7ed883775106306f940b87ad55ff9dee30c7014b3d596.js
If you look closely, you'll notice that some resources appear in the list twice. To prevent this behaviour, you need to check whether the resource is in the start stage, using the stage property of the response object. The stage property has 2 possible values: start, which is set when the first byte of the response arrives, and end, which is set when the complete response has been received. To show a resource only once, add a conditional statement in the onResourceReceived event:
// PhantomJS script: open a page and print the URL of each received resource
// once, instead of once per response stage.
var webpage = require('webpage');
var page = webpage.create();
var websiteToCheck = "https://github.com";

// Register the handler before starting the load so it is visibly in place
// when the first response notification is delivered.
page.onResourceReceived = function (response) {
    // The same resource is reported in more than one stage ("start" when the
    // first byte arrives, "end" when the response is complete). Only the
    // "start" notification is printed, so every URL appears a single time.
    if (response.stage !== 'start') {
        return;
    }

    console.log(response.url);
};

page.open(websiteToCheck, function () {
    // Exit PhantomJS when page.open invokes its callback.
    phantom.exit();
});
That should now print the following in the console:
https://github.com/
https://assets-cdn.github.com/assets/frameworks-a44e0bdd1666101af23963e4027cd7a0a1eea1339e0e7422524f2e7f3900e86b.css
https://assets-cdn.github.com/assets/github-ac9c637b29122a4699fcd4d205b2d09efa4d4962d369158f7d907123061143f1.css
https://assets-cdn.github.com/assets/site-052f19062b5cc9c804bcfe6c835ee11a90f898e7524d1609f639301a5eb7cd1d.css
https://assets-cdn.github.com/images/modules/site/inform-globe-transparent.svg
https://assets-cdn.github.com/images/modules/site/home-ill-build.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-work.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-projects.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-platform.png?sn
https://assets-cdn.github.com/images/modules/site/org_example_nasa.png?sn
https://assets-cdn.github.com/assets/compat-8e19569aacd39e737a14c8515582825f3c90d1794c0e5539f9b525b8eb8b5a8e.js
https://assets-cdn.github.com/assets/frameworks-a631ecd079e91d27e8c4826bced857c2e359148f6e4139c2485ee4eaf6e8b493.js
https://assets-cdn.github.com/assets/github-e34181e8d9bc6f988dd7ed883775106306f940b87ad55ff9dee30c7014b3d596.js
https://assets-cdn.github.com/images/modules/site/home-hero-sm.jpg?sn
Structure of a response object in the onResourceReceived event
The response metadata object received as first argument in the callback contains these properties:
id: the number of the requested resource
url: the URL of the requested resource
time: Date object containing the date of the response
headers: list of http headers
bodySize: size of the received content decompressed (entire content or chunk content)
contentType: the content type if specified
redirectURL: if there is a redirection, the redirected URL
stage: "start", "end" (FIXME: other value for intermediate chunk?)
status: http status code. ex: 200
statusText: http status text. ex: OK
Every response object passed to the callback has the following structure:
{
"body":"",
"bodySize":4714,
"contentType":"text/html; charset=utf-8",
"headers":[
{
"name":"Server",
"value":"GitHub.com"
},
{
"name":"Date",
"value":"Thu, 09 Feb 2017 12:35:38 GMT"
},
{
"name":"Content-Type",
"value":"text/html; charset=utf-8"
},
{
"name":"Transfer-Encoding",
"value":"chunked"
},
{
"name":"Status",
"value":"200 OK"
},
{
"name":"Cache-Control",
"value":"no-cache"
},
{
"name":"X-UA-Compatible",
"value":"IE=Edge,chrome=1"
},
{
"name":"Set-Cookie",
"value":"logged_in=no; domain=.github.com; path=/; expires=Mon, 09 Feb 2037 12:35:38 -0000; secure; HttpOnly\n_gh_sess=eyJzZXNzaW9uX2lkIjoiMTQ2Y2VjOTM2YWY2MTIwYzZkZGRmNGI0NzY5MGQ1YTAiLCJfY3NyZl90b2tlbiI6IkEwc1BxQlNYTndyWm9oUFh1aDIxWGlBOE5ZNmlCbnE0cjJ1K0JldUNJaFU9In0%3D--9f27661358a0c06e16dc86f7a085b33263f5633e; path=/; secure; HttpOnly"
},
{
"name":"X-Request-Id",
"value":"fef18cf6da42783a3a5ad53b876bb153"
},
{
"name":"X-Runtime",
"value":"0.039490"
},
{
"name":"Content-Security-Policy",
"value":"default-src 'none'; connect-src 'self' uploads.github.com status.github.com collector.githubapp.com api.github.com www.google-analytics.com github-cloud.s3.amazonaws.com wss://live.github.com; font-src assets-cdn.github.com; frame-src render.githubusercontent.com; img-src 'self' data: assets-cdn.github.com identicons.github.com collector.githubapp.com github-cloud.s3.amazonaws.com *.githubusercontent.com; media-src 'none'; script-src assets-cdn.github.com; style-src 'unsafe-inline' assets-cdn.github.com"
},
{
"name":"Strict-Transport-Security",
"value":"max-age=31536000; includeSubdomains; preload"
},
{
"name":"Public-Key-Pins",
"value":"max-age=5184000; pin-sha256=\"WoiWRyIOVNa9ihaBciRSC7XHjliYS9VwUGOIud4PB18=\"; pin-sha256=\"RRM1dGqnDFsCJXBTHky16vi1obOlCgFFn/yOhI/y+ho=\"; pin-sha256=\"k2v657xBsOVe1PQRwOsHsw3bsGT2VzIqz5K+59sNQws=\"; pin-sha256=\"K87oWBWM9UZfyddvDfoxL+8lpNyoUB2ptGtn0fv6G2Q=\"; pin-sha256=\"IQBnNBEiFuhj+8x6X8XLgh01V9Ic5/V3IRQLNFFc7v4=\"; pin-sha256=\"iie1VXtL7HzAMF+/PVPR9xzT80kQxdZeJ+zduCB3uj0=\"; pin-sha256=\"LvRiGEjRqfzurezaWuj8Wie2gyHMrW5Q06LspMnox7A=\"; includeSubDomains"
},
{
"name":"X-Content-Type-Options",
"value":"nosniff"
},
{
"name":"X-Frame-Options",
"value":"deny"
},
{
"name":"X-XSS-Protection",
"value":"1; mode=block"
},
{
"name":"Vary",
"value":"X-PJAX, Accept-Encoding"
},
{
"name":"X-Served-By",
"value":"1868c9f28a71d80b2987f48dbd1824a0"
},
{
"name":"Content-Encoding",
"value":"gzip"
},
{
"name":"X-GitHub-Request-Id",
"value":"D86F:6207:1645007:23F3A82:589C6219"
}
],
"id":1,
"redirectURL":null,
"stage":"start",
"status":200,
"statusText":"OK",
"time":"2017-02-09T12:35:37.537Z",
"url":"https://github.com/"
}
You can use this feature to copy a website and download all the resources locally.
Happy coding !