Web scraping with Node
Follow Along
http://j.mp/nicar-13-node
Schedule
1. Set up the room
2. Simple Sites
3. Forms
4. Scraping Evaluated Source
Set up the room.
# Install Node itself (Homebrew)
brew install node
# Install the three modules used in this session globally (-g)
npm install -g jsdom
npm install -g request
npm install -g zombie
(or use the Windows installer: http://nodejs.org/download/)
Add the Node module paths to your .profile or .bashrc:
# NODE_PATH is a colon-separated list. Inside double quotes a trailing
# backslash joins the lines, so any space before it would become part of
# the next path entry -- keep the separators tight.
export NODE_PATH="/usr/local/lib/node:\
/usr/local/lib/node_modules:\
/usr/local/share/npm/lib/node_modules"
The console
var request = require('request');
//=> undefined
// (it worked)
Simple Sites
var fs = require('fs');
var request = require('request');
var jsdom = require('jsdom').jsdom;
Get a web page.
// Fetch `url` with the `request` module and pass the response body to `cb`.
//
// url - address of the page to download
// cb  - function(body), called only when the request succeeds with HTTP 200
//
// Failures (transport errors or any other status code) are reported on
// stderr instead of being dropped silently; `cb` is not called for them, so
// callers can keep assuming they receive the body of a successful response.
var get = function(url, cb) {
  request(url, function (error, response, body) {
    if (error) {
      console.error("GET " + url + " failed: " + (error.message || error));
      return;
    }
    if (response.statusCode !== 200) {
      console.error("GET " + url + " returned HTTP " + response.statusCode);
      return;
    }
    cb(body);
  });
};
Get a (jQueryified) DOM
// Parse an HTML string with jsdom, open a window on the resulting document,
// and ask jsdom to add jQuery to that window.
//
// html - markup to parse
// cb   - handed to jsdom.jQueryify as its callback; the callers in this file
//        expect it to receive the window as its first argument
var createDocument = function(html, cb) {
  var win = jsdom(html).createWindow();
  jsdom.jQueryify(win, cb);
};
Scrape.
// Stub for the page-specific scraping step: takes a window argument and
// has no body yet.
var scrape = function(window) {
// do stuff with your window
};
So far:
// The pipeline so far: download the page at `url` (which must be defined
// before this runs), then build a jQuery-enabled window from its HTML.
get(url, function(body) {
createDocument(body, function(window) {
// do stuff with your window
});
});
FEC.gov
// FEC filing page to scrape (the literal is split in two only for width).
var url = "http://query.nictusa.com/cgi-bin/dcdev" +
  "/forms/C00490045/766953/sa/11AI";

get(url, function(body) {
  createDocument(body, function(window) {
    // your scraper
    var $ = window.$;
    var json = [];
    // Collect the text of the <b> child of every left-aligned table cell
    // (presumably one donor name per cell).
    $("table tr td[align=LEFT]").each(function(index, cell) {
      json.push($(cell).children("b").text());
    });
  });
});
#=> Marc Andreessen
Michael Armstrong
Alan Ashton
Lynda Balfour
...
Save it as JSON.
// Serialize the collected names and write them to donors.json.
// `json` is local to the scraper callback, so these lines belong inside
// that callback, after the .each() loop has filled the array.
// `jsonStr` is declared with `var` so it does not leak as an implicit global.
var jsonStr = JSON.stringify(json);
fs.writeFileSync("donors.json", jsonStr);
["Marc Andreessen","Michael Armstrong","Alan Ashton",
"Lynda Balfour","Frank Baxter","Glen Beck",
"James Berardinelli","Andrew Bernstein",
"Robert D Beyer","Lynn Booth","David Bradford",
"James Brown","Frank Bruno","Buena Vista Investments LLC",...]
Putting it together:
http://j.mp/VaMdb1
Forms
Sometimes you need to interact with web pages to get data out
Goal: Let's grab all the FEC-registered committees with Obama in their names
The form:
http://www.fec.gov/finance/disclosure/efile_search.shtml
We'll write a script to fill this out, and POST it:
Grab the H4's
Zombie.js
var fs = require("fs");
var Browser = require("zombie");
Set up Zombie
// Single Zombie browser instance used by the rest of the script.
var browser = new Browser();
// use this unless you need evaluated source
browser.runScripts = false;
// because the FEC's site is so slow
// (both limits are raised to 30000 -- presumably milliseconds; confirm
// against the Zombie documentation)
browser.maxWait = 30000;
browser.waitFor = 30000;
1. Visit the page
2. Fill out the form
3. Press Button, Receive Data
// ZombieJS methods for
// interacting with forms
browser.visit()        // step 1: load the page
browser.fill()         // step 2: fill in a form field
browser.pressButton()  // step 3: press a button and receive the result
// Search the FEC e-filing form for "Obama" and write the matching
// committees to obama-cmtes.json as { committeeId: committeeName }.
var url = "http://www.fec.gov/finance/" +
  "disclosure/efile_search.shtml";

browser.visit(url, function (e, browser, status) {
  if (e) {
    // Report load problems instead of ignoring them, then carry on as the
    // script always did in case the page is still usable.
    console.error("Problem loading " + url + ": " + (e.message || e));
  }
  browser.
    fill("name", "Obama").
    pressButton("submit", function() {
      // Declared locally: without this the assignment below throws a
      // ReferenceError because no `json` exists in this script.
      var json = {};
      var committees = browser.queryAll("h4 a");
      committees.forEach(function(com) {
        // Judging by the result, link text has the form "NAME - ID".
        var cmte = com.innerHTML.split(/\s-\s/);
        if (cmte.length < 2) {
          // No " - ID" part, so there is no key to store this link under.
          return;
        }
        json[cmte[1]] = cmte[0];
      });
      var jsonStr = JSON.stringify(json);
      fs.writeFileSync("obama-cmtes.json", jsonStr);
    });
});
Follow along:
http://j.mp/14UDZEo
Result
{
"C00505784": "AFRICA FOR OBAMA (AFO)",
"C00527127": "AMERICA VS OBAMA",
"C00496729": "BEAT OBAMA POLITICAL ACTION COMMITTEE",
"C00446179": "KOREAN AMERICANS FOR OBAMA",
"C00431445": "OBAMA FOR AMERICA",
"C00431130": "OBAMA FOR AMERICA DRAFT COMMITTEE",
"C00347583": "OBAMA FOR CONGRESS 2000",
"C00451393": "OBAMA VICTORY FUND",
"C00494740": "OBAMA VICTORY FUND 2012",
"C00479865": "REPEAL OBAMACARE PAC",
"C00524579": "REPLACE OBAMACARE DEMOCRATS",
"C00449900": "ROCHESTER NY DELEGATES FOR OBAMA"
}
Scraping Evaluated Source + JS Forms
// Turn script execution on -- needed when scraping evaluated source.
browser.runScripts = true;
Dragons
1. document.write()
2. Script errors halt execution
3. Zombie is buggy when scripts can run
// workaround to rewrite `document.write` to ""
// before jsdom parses it. Warning: dragons ahead!
http://j.mp/XSRKze
Phantom.js
Not Node! Headless WebKit
brew install phantomjs
http://phantomjs.org/download.html
Still. Dragons. Here's a simple example.
// PhantomJS script (run with `phantomjs`, not `node`): opens URL, injects
// jQuery into the page, and prints the page's <title>.
// URL is a placeholder -- define it as the address of the page to scrape.
var page = require('webpage').create();

// Relay console output from the page context to the terminal.
page.onConsoleMessage = function (msg, line, source) {
  console.log('console> ' + msg);
};

page.open(URL, function(status) {
  // The open callback reports "success" or "fail"; do not try to evaluate
  // a page that never loaded.
  if (status !== 'success') {
    console.log('Unable to load ' + URL);
    phantom.exit(1);
    return;
  }
  page.includeJs("jquery.js", function() {
    // The function passed to evaluate() runs inside the page, where the
    // injected jQuery's `$` is available.
    var title = page.evaluate(function() {
      return $("title").html();
    });
    console.log(title);
    // PhantomJS keeps running until explicitly told to exit.
    phantom.exit();
  });
});
Thanks. Questions?
Al Shaw NICAR 2013 @A_L