Web scraping with Node

Follow Along
http://j.mp/nicar-13-node
    

Schedule

1. Set up the room
2. Simple Sites
3. Forms
4. Scraping Evaluated Source

Set up the room.

# Node itself, via Homebrew
brew install node
# the three modules used in the examples, installed globally (-g)
npm install -g jsdom
npm install -g request
npm install -g zombie

(or use the Windows installer: http://nodejs.org/download/)
    
Add the node and modules paths to your .profile or .bashrc:

# Colon-separated list of directories Node searches for modules.
# Built one entry at a time: continuing a double-quoted string with
# backslash-newline keeps the spaces around the line break, and those
# spaces would become part of the path entries.
NODE_PATH="/usr/local/lib/node"
NODE_PATH="$NODE_PATH:/usr/local/lib/node_modules"
NODE_PATH="$NODE_PATH:/usr/local/share/npm/lib/node_modules"
export NODE_PATH

The console

$ node
var request = require('request');
//=> undefined
// (it worked)

Simple Sites

var fs       = require('fs');
var request  = require('request');
var jsdom    = require('jsdom').jsdom;
      

Get a web page.

/**
 * Fetch a web page and hand its body to a callback.
 *
 * @param {string} url - address to fetch; passed straight to `request`.
 * @param {function(string)} cb - called with the response body, and only
 *   when the request succeeds with HTTP status 200.
 */
var get = function(url, cb) {
  request(url, function (error, response, body) {
    // Report failures instead of dropping them silently; `cb` is still
    // only ever called for a successful 200 response.
    if (error) {
      console.error("GET " + url + " failed: " + (error.message || error));
      return;
    }
    if (response.statusCode !== 200) {
      console.error("GET " + url + " returned HTTP " + response.statusCode);
      return;
    }
    cb(body);
  });
};
          

Get a (jQueryified) DOM

// Parse an HTML string with jsdom, create a window from the resulting
// document, and pass that window to jsdom.jQueryify together with `cb`.
var createDocument = function(html, cb) {
  var win = jsdom(html).createWindow();
  jsdom.jQueryify(win, cb);
};
              

Scrape.

// Placeholder for the scraping step: it receives the window and, as
// written here, does nothing with it.
var scrape = function(window) {
  // do stuff with your window
};  

So far:

// Fetch the page, then build a window from the response body.
get(url, function(html) {
  createDocument(html, function(win) {
    // do stuff with your window
  });
});

FEC.gov

// The page whose donor names we want.
var url = "http://query.nictusa.com/cgi-bin/dcdev" +
          "/forms/C00490045/766953/sa/11AI";

get(url, function(body) {
  createDocument(body, function(window) {

    // your scraper
    var $    = window.$;
    var json = [];

    // Walk every left-aligned table cell and collect the text of the
    // <b> element that is its direct child.
    $("table tr td[align=LEFT]").each(function() {
      json.push($(this).children("b").text());
    });

  });
});

#=> Marc Andreessen
    Michael Armstrong
    Alan Ashton
    Lynda Balfour
    ...

Save it as JSON.

// Serialize the collected names and write them, synchronously, to
// donors.json. Declared with `var` so jsonStr is not an implicit global
// (an undeclared assignment throws in strict mode).
var jsonStr = JSON.stringify(json);
fs.writeFileSync("donors.json", jsonStr);
  
["Marc Andreessen","Michael Armstrong","Alan Ashton",
"Lynda Balfour","Frank Baxter","Glen Beck",
"James Berardinelli","Andrew Bernstein",
"Robert D Beyer","Lynn Booth","David Bradford",
"James Brown","Frank Bruno","Buena Vista Investments LLC",...]
  
Putting it together:
http://j.mp/VaMdb1

Forms

Sometimes you need to interact with web pages to get data out.

Goal: Let's grab all the FEC-registered committees with Obama in their names

The form:
http://www.fec.gov/finance/disclosure/efile_search.shtml

We'll write a script to fill this out, and POST it:


Grab the H4s

Zombie.js

var fs      = require("fs");
var Browser = require("zombie");

Set up Zombie

// The browser instance the form example drives.
var browser = new Browser();

// use this unless you need evaluated source
browser.runScripts = false;

// because the FEC's site is so slow
// NOTE(review): 30000 is presumably milliseconds (30 s); confirm against Zombie's docs
browser.maxWait    = 30000;
browser.waitFor    = 30000;

1. Visit the page
2. Fill out the form
3. Press Button, Receive Data

// ZombieJS methods for
// interacting with forms
// (names only; the full example shows the arguments each one takes)
browser.visit()        // 1. visit the page
browser.fill()         // 2. fill out the form
browser.pressButton()  // 3. press the button, receive data
// The FEC search form to fill out and submit.
var url = "http://www.fec.gov/finance/" +
          "disclosure/efile_search.shtml";

// Visit the form, search for "Obama", and save the matching committees
// to obama-cmtes.json as an object of { committee ID: committee name }.
// NOTE: `e` and `status` are not inspected here.
browser.visit(url, function (e, browser, status) {
  browser.
    fill("name", "Obama").
    pressButton("submit", function() {
      // Must be declared before use; without this the assignment in the
      // loop below throws a ReferenceError.
      var json = {};
      var committees = browser.queryAll("h4 a");
      committees.forEach(function(com) {
        // Each link reads "<name> - <ID>"; split on the separating dash.
        var cmte = com.innerHTML.split(/\s-\s/);
        // Skip links that don't split into a name and an ID, so they are
        // not stored under the key "undefined".
        if (cmte.length < 2) {
          return;
        }
        json[cmte[1]] = cmte[0];
      });
      var jsonStr = JSON.stringify(json);
      fs.writeFileSync("obama-cmtes.json", jsonStr);
    });
});
Follow along:
http://j.mp/14UDZEo

Result

{
    "C00505784": "AFRICA FOR OBAMA (AFO)",
    "C00527127": "AMERICA VS OBAMA",
    "C00496729": "BEAT OBAMA POLITICAL ACTION COMMITTEE",
    "C00446179": "KOREAN AMERICANS FOR OBAMA",
    "C00431445": "OBAMA FOR AMERICA",
    "C00431130": "OBAMA FOR AMERICA DRAFT COMMITTEE",
    "C00347583": "OBAMA FOR CONGRESS 2000",
    "C00451393": "OBAMA VICTORY FUND",
    "C00494740": "OBAMA VICTORY FUND 2012",
    "C00479865": "REPEAL OBAMACARE PAC",
    "C00524579": "REPLACE OBAMACARE DEMOCRATS",
    "C00449900": "ROCHESTER NY DELEGATES FOR OBAMA"
}

Scraping Evaluated Source + JS Forms

// needed when you want evaluated source (the form example set this to false)
browser.runScripts = true;

Dragons

1. document.write()
2. Script errors halt execution
3. Zombie is buggy when scripts can run

// workaround to rewrite `document.write` to ""
// before jsdom parses it. Warning: dragons ahead!
http://j.mp/XSRKze

Phantom.js

Not Node! Headless WebKit

brew install phantomjs

http://phantomjs.org/download.html

Still. Dragons. Here's a simple example.

var page = require('webpage').create();

// Forward console output from inside the page to PhantomJS's own console.
page.onConsoleMessage = function (msg, line, source) {
  console.log('console> ' + msg);
};

// Open the page, include jQuery, print the page title, then quit.
page.open(URL, function(status) {
  // Bail out (and exit) if the page could not be loaded.
  if (status !== 'success') {
    console.log('Unable to load ' + URL);
    phantom.exit(1);
    return;
  }

  page.includeJs("jquery.js", function() {
    // The function passed to evaluate() runs inside the page, which is
    // why `$` is available to it.
    var title = page.evaluate(function() {
      return $("title").html();
    });
    console.log(title);

    // PhantomJS keeps running until told to stop; without this the
    // script never terminates.
    phantom.exit();
  });
});

Thanks. Questions?

Al Shaw    NICAR 2013    @A_L