Learn how to check which web resources (images, css, js, html and fonts) are requested by PhantomJS from a website

How to check which web resources were received with PhantomJS

In order to verify which resources were received, we need to rely on the onResourceReceived event of a webpage in PhantomJS. This callback is invoked when a resource requested by the page is received. The only argument to the callback is the response metadata object. If the resource is large and sent by the server in multiple chunks, onResourceReceived will be invoked for every chunk received by PhantomJS. In this event all the web resources like images, fonts, style sheets and scripts will be listed.

PhantomJS is a headless browser built on QtWebKit (it is not Chromium), but the idea behind its resource loading is comparable to the multi-process resource loading method used by browsers such as Chromium, where all network communication is handled by the main browser process. This is done not only so that the browser process can control each renderer's access to the network, but also so that it can maintain consistent session state, such as cookies and cached data, across processes. It is also important because, as an HTTP/1.1 user agent, the browser as a whole should not open too many connections per host.

Check which resources were received

// Print the URL of every resource response PhantomJS receives while loading
// a website.
var webpage = require('webpage');
var page = webpage.create();
var websiteToCheck = "https://github.com";

// Register the handler before calling open() so it is already in place when
// the first request is issued. The callback may be invoked more than once for
// the same resource, so a URL can appear several times in the output.
page.onResourceReceived = function(response) {
    console.log(response.url);
};

page.open(websiteToCheck, function(status) {
    // open() reports 'success' or 'fail'. Report a failed load instead of
    // exiting silently with an empty (or partial) listing.
    if (status !== 'success') {
        console.log('Unable to load ' + websiteToCheck);
        phantom.exit(1);
        return;
    }

    phantom.exit();
});

To check how this works, save the previous code in a script (index.js) and execute it with PhantomJS by running phantomjs index.js. The code will generate the following output:

https://github.com/
https://github.com/
https://assets-cdn.github.com/assets/site-052f19062b5cc9c804bcfe6c835ee11a90f898e7524d1609f639301a5eb7cd1d.css
https://assets-cdn.github.com/assets/frameworks-a44e0bdd1666101af23963e4027cd7a0a1eea1339e0e7422524f2e7f3900e86b.css
https://assets-cdn.github.com/assets/github-ac9c637b29122a4699fcd4d205b2d09efa4d4962d369158f7d907123061143f1.css
https://assets-cdn.github.com/images/modules/site/inform-globe-transparent.svg
https://assets-cdn.github.com/images/modules/site/inform-globe-transparent.svg
https://assets-cdn.github.com/images/modules/site/home-ill-build.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-build.png?sn
https://assets-cdn.github.com/assets/site-052f19062b5cc9c804bcfe6c835ee11a90f898e7524d1609f639301a5eb7cd1d.css
https://assets-cdn.github.com/images/modules/site/home-ill-work.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-work.png?sn
https://assets-cdn.github.com/assets/frameworks-a44e0bdd1666101af23963e4027cd7a0a1eea1339e0e7422524f2e7f3900e86b.css
https://assets-cdn.github.com/images/modules/site/home-ill-projects.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-platform.png?sn
https://assets-cdn.github.com/images/modules/site/org_example_nasa.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-projects.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-platform.png?sn
https://assets-cdn.github.com/assets/compat-8e19569aacd39e737a14c8515582825f3c90d1794c0e5539f9b525b8eb8b5a8e.js
https://assets-cdn.github.com/assets/compat-8e19569aacd39e737a14c8515582825f3c90d1794c0e5539f9b525b8eb8b5a8e.js
https://assets-cdn.github.com/assets/frameworks-a631ecd079e91d27e8c4826bced857c2e359148f6e4139c2485ee4eaf6e8b493.js
https://assets-cdn.github.com/assets/github-e34181e8d9bc6f988dd7ed883775106306f940b87ad55ff9dee30c7014b3d596.js
https://assets-cdn.github.com/assets/github-ac9c637b29122a4699fcd4d205b2d09efa4d4962d369158f7d907123061143f1.css
https://assets-cdn.github.com/images/modules/site/org_example_nasa.png?sn
https://assets-cdn.github.com/assets/frameworks-a631ecd079e91d27e8c4826bced857c2e359148f6e4139c2485ee4eaf6e8b493.js
https://assets-cdn.github.com/images/modules/site/home-hero-sm.jpg?sn
https://assets-cdn.github.com/images/modules/site/home-hero-sm.jpg?sn
https://assets-cdn.github.com/assets/github-e34181e8d9bc6f988dd7ed883775106306f940b87ad55ff9dee30c7014b3d596.js

If you look carefully, you've probably noticed that some resources appear in the list twice. To prevent this behaviour, you need to check whether the resource is in the start stage by using the stage property of the response object. The stage property has 2 possible values: start, which is reported when the first byte arrives, and end, which is reported once the complete response has been received. To show a resource only once, add a conditional statement in the onResourceReceived event:

// Print the URL of every resource PhantomJS receives while loading a website,
// listing each resource only once.
var webpage = require('webpage');
var page = webpage.create();
var websiteToCheck = "https://github.com";

// Register the handler before calling open() so it is already in place when
// the first request is issued.
page.onResourceReceived = function(response) {
    // A resource is reported at the 'start' stage and again at the 'end'
    // stage. Ignore the 'end' notification so each URL is printed only once.
    if (response.stage === 'end') {
        return;
    }

    console.log(response.url);
};

page.open(websiteToCheck, function(status) {
    // open() reports 'success' or 'fail'. Report a failed load instead of
    // exiting silently with an empty (or partial) listing.
    if (status !== 'success') {
        console.log('Unable to load ' + websiteToCheck);
        phantom.exit(1);
        return;
    }

    phantom.exit();
});

That should now print the following in the console:

https://github.com/
https://assets-cdn.github.com/assets/frameworks-a44e0bdd1666101af23963e4027cd7a0a1eea1339e0e7422524f2e7f3900e86b.css
https://assets-cdn.github.com/assets/github-ac9c637b29122a4699fcd4d205b2d09efa4d4962d369158f7d907123061143f1.css
https://assets-cdn.github.com/assets/site-052f19062b5cc9c804bcfe6c835ee11a90f898e7524d1609f639301a5eb7cd1d.css
https://assets-cdn.github.com/images/modules/site/inform-globe-transparent.svg
https://assets-cdn.github.com/images/modules/site/home-ill-build.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-work.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-projects.png?sn
https://assets-cdn.github.com/images/modules/site/home-ill-platform.png?sn
https://assets-cdn.github.com/images/modules/site/org_example_nasa.png?sn
https://assets-cdn.github.com/assets/compat-8e19569aacd39e737a14c8515582825f3c90d1794c0e5539f9b525b8eb8b5a8e.js
https://assets-cdn.github.com/assets/frameworks-a631ecd079e91d27e8c4826bced857c2e359148f6e4139c2485ee4eaf6e8b493.js
https://assets-cdn.github.com/assets/github-e34181e8d9bc6f988dd7ed883775106306f940b87ad55ff9dee30c7014b3d596.js
https://assets-cdn.github.com/images/modules/site/home-hero-sm.jpg?sn

Structure of a response object in the onResourceReceived event

The response metadata object received as first argument in the callback contains these properties:

  • id : the number of the requested resource
  • url : the URL of the requested resource
  • time : Date object containing the date of the response
  • headers : list of http headers
  • bodySize : size of the received content decompressed (entire content or chunk content)
  • contentType : the content type if specified
  • redirectURL : if there is a redirection, the redirected URL
  • stage : "start", "end" (FIXME: other value for intermediate chunk?)
  • status : http status code. ex: 200
  • statusText : http status text. ex: OK

Every response object shown in the event has the following structure:

{  
   "body":"",
   "bodySize":4714,
   "contentType":"text/html; charset=utf-8",
   "headers":[  
      {  
         "name":"Server",
         "value":"GitHub.com"
      },
      {  
         "name":"Date",
         "value":"Thu, 09 Feb 2017 12:35:38 GMT"
      },
      {  
         "name":"Content-Type",
         "value":"text/html; charset=utf-8"
      },
      {  
         "name":"Transfer-Encoding",
         "value":"chunked"
      },
      {  
         "name":"Status",
         "value":"200 OK"
      },
      {  
         "name":"Cache-Control",
         "value":"no-cache"
      },
      {  
         "name":"X-UA-Compatible",
         "value":"IE=Edge,chrome=1"
      },
      {  
         "name":"Set-Cookie",
         "value":"logged_in=no; domain=.github.com; path=/; expires=Mon, 09 Feb 2037 12:35:38 -0000; secure; HttpOnly\n_gh_sess=eyJzZXNzaW9uX2lkIjoiMTQ2Y2VjOTM2YWY2MTIwYzZkZGRmNGI0NzY5MGQ1YTAiLCJfY3NyZl90b2tlbiI6IkEwc1BxQlNYTndyWm9oUFh1aDIxWGlBOE5ZNmlCbnE0cjJ1K0JldUNJaFU9In0%3D--9f27661358a0c06e16dc86f7a085b33263f5633e; path=/; secure; HttpOnly"
      },
      {  
         "name":"X-Request-Id",
         "value":"fef18cf6da42783a3a5ad53b876bb153"
      },
      {  
         "name":"X-Runtime",
         "value":"0.039490"
      },
      {  
         "name":"Content-Security-Policy",
         "value":"default-src 'none'; connect-src 'self' uploads.github.com status.github.com collector.githubapp.com api.github.com www.google-analytics.com github-cloud.s3.amazonaws.com wss://live.github.com; font-src assets-cdn.github.com; frame-src render.githubusercontent.com; img-src 'self' data: assets-cdn.github.com identicons.github.com collector.githubapp.com github-cloud.s3.amazonaws.com *.githubusercontent.com; media-src 'none'; script-src assets-cdn.github.com; style-src 'unsafe-inline' assets-cdn.github.com"
      },
      {  
         "name":"Strict-Transport-Security",
         "value":"max-age=31536000; includeSubdomains; preload"
      },
      {  
         "name":"Public-Key-Pins",
         "value":"max-age=5184000; pin-sha256=\"WoiWRyIOVNa9ihaBciRSC7XHjliYS9VwUGOIud4PB18=\"; pin-sha256=\"RRM1dGqnDFsCJXBTHky16vi1obOlCgFFn/yOhI/y+ho=\"; pin-sha256=\"k2v657xBsOVe1PQRwOsHsw3bsGT2VzIqz5K+59sNQws=\"; pin-sha256=\"K87oWBWM9UZfyddvDfoxL+8lpNyoUB2ptGtn0fv6G2Q=\"; pin-sha256=\"IQBnNBEiFuhj+8x6X8XLgh01V9Ic5/V3IRQLNFFc7v4=\"; pin-sha256=\"iie1VXtL7HzAMF+/PVPR9xzT80kQxdZeJ+zduCB3uj0=\"; pin-sha256=\"LvRiGEjRqfzurezaWuj8Wie2gyHMrW5Q06LspMnox7A=\"; includeSubDomains"
      },
      {  
         "name":"X-Content-Type-Options",
         "value":"nosniff"
      },
      {  
         "name":"X-Frame-Options",
         "value":"deny"
      },
      {  
         "name":"X-XSS-Protection",
         "value":"1; mode=block"
      },
      {  
         "name":"Vary",
         "value":"X-PJAX, Accept-Encoding"
      },
      {  
         "name":"X-Served-By",
         "value":"1868c9f28a71d80b2987f48dbd1824a0"
      },
      {  
         "name":"Content-Encoding",
         "value":"gzip"
      },
      {  
         "name":"X-GitHub-Request-Id",
         "value":"D86F:6207:1645007:23F3A82:589C6219"
      }
   ],
   "id":1,
   "redirectURL":null,
   "stage":"start",
   "status":200,
   "statusText":"OK",
   "time":"2017-02-09T12:35:37.537Z",
   "url":"https://github.com/"
}

You can use this feature to copy a website and download all the resources locally.

Happy coding !


Senior Software Engineer at Software Medico. Interested in programming since he was 14 years old, Carlos is a self-taught programmer and founder and author of most of the articles at Our Code World.

Sponsors