Web scraping with Node NICAR12

slides
  http://j.mp/node-nicar  
code
  http://j.mp/node-nicar-js
me
  @A_L
  

JavaScript running locally.

That means you get the fileystem, HTTP

and jQuery.

You know this!

$("h2").each(function() {
  console.log($(this).html())
});
    

Set up the room.

brew install node

curl http://npmjs.org/install.sh | sh

npm install -g jsdom

npm install -g request
    
add node & modules path to your .profile or .bashrc

export NODE_PATH="/usr/local/lib/node:/usr/local/lib/node_modules"
var fs       = require('fs');
var request  = require('request');
var jsdom    = require('jsdom').jsdom;
      

Get a web page.

var get = function(url, cb) {
  request(url, function (error, response, body) {
    if (!error && response.statusCode == 200) {
      cb(body);
    }
  });
};
          

Get a (jQueryified) DOM!

var createDocument = function(html, cb) {
  var document = jsdom(html);
  var window   = document.createWindow();
  jsdom.jQueryify(window, cb);
};
              

Scrape.

var scrape = function(window) {
  // do stuff with your window
};  

So far:

get(url, function(body) {
  createDocument(body, function(window) {
    // do stuff with your window
  });
});

FEC.gov

var url = "http://query.nictusa.com/cgi-bin/dcdev" +
          "/forms/C00490045/766953/sa/11AI";

get(url, function(body) {
  createDocument(body, function(window) {
   
    // your scraper
    var $      = window.$;
    var json   = [];
    var donors = $("table tr td[align=LEFT]");
    
    donors.each(function() {
      var name = $($(this).html().split("<br>")[0]).text();
      json.push(name);
    });    

  });
});

#=> Marc Andreessen
    Michael Armstrong
    Alan Ashton
    Lynda Balfour
    ...

Save it as JSON.

jsonStr = JSON.stringify(json);
fs.writeFileSync("donors.json", jsonStr);
  
["Marc Andreessen","Michael Armstrong","Alan Ashton",
"Lynda Balfour","Frank Baxter","Glen Beck",
"James Berardinelli","Andrew Bernstein",
"Robert D Beyer","Lynn Booth","David Bradford",
"James Brown","Frank Bruno","Buena Vista Investments LLC",...]
  

Thank you.

Al Shaw    NICAR 2012    @A_L