1

Updating scraper

This commit is contained in:
Alex Yatskov 2015-03-26 15:46:09 +09:00
parent aecaf6a8f8
commit 7a8011fc0c
2 changed files with 39 additions and 22 deletions

View File

@@ -13,6 +13,7 @@
"mysql": "^2.5.0", "mysql": "^2.5.0",
"underscore": "~1.6.0", "underscore": "~1.6.0",
"node-geocoder": "~2.11.0", "node-geocoder": "~2.11.0",
"geolib": "~2.0.14" "geolib": "~2.0.14",
"jsonfile": "~2.0.0"
} }
} }

View File

@@ -23,6 +23,7 @@
var cheerio = require('cheerio'); var cheerio = require('cheerio');
var request = require('request'); var request = require('request');
var underscore = require('underscore');
var url = require('url'); var url = require('url');
var path = require('path'); var path = require('path');
var fs = require('fs'); var fs = require('fs');
@@ -45,8 +46,7 @@ function requestCached(relativeUrl, callback) {
} }
function getBarPercent(bar) { function getBarPercent(bar) {
var width = bar.css('width'); return parseFloat(bar.attr('alt')) / 5.0;
return parseInt(width) / 91.0;
} }
function reviewScraped(err, resp, html) { function reviewScraped(err, resp, html) {
@@ -56,27 +56,31 @@ function reviewScraped(err, resp, html) {
var $ = cheerio.load(html); var $ = cheerio.load(html);
var address = $('div.addr').text().trim(); var address = $('address span.format_address').text().trim();
if (!address) { if (!address) {
console.warn('Warning: review skipped, no address');
return; return;
} }
var storeName = $('h1#HEADING').text().trim(); var storeName = $('h1#HEADING').text().trim();
if (storeName.indexOf('CLOSED') !== -1) { if (storeName.indexOf('CLOSED') !== -1) {
console.warn('Warning: review skipped, closed');
return; return;
} }
var bars = $('div.fill'); var rating = $('ul.barChart img.rating_s_fill');
if (bars.length !== 9) { if (rating.length != 4) {
console.warn('Warning: review skipped, no summary');
return; return;
} }
var rateFood = getBarPercent($(bars[5])); var rateFood = getBarPercent($(rating[0]));
var rateService = getBarPercent($(bars[6])); var rateService = getBarPercent($(rating[1]));
var rateValue = getBarPercent($(bars[7])); var rateValue = getBarPercent($(rating[2]));
var rateAtmosphere = getBarPercent($(bars[8])); var rateAtmosphere = getBarPercent($(rating[3]));
if (rateFood === 0.0 && rateService === 0.0 && rateValue === 0.0 && rateAtmosphere === 0.0) { if (rateFood === 0.0 && rateService === 0.0 && rateValue === 0.0 && rateAtmosphere === 0.0) {
console.warn('Warning: review skipped, empty review');
return; return;
} }
@@ -92,6 +96,10 @@ function reviewScraped(err, resp, html) {
} }
}; };
if (data.rating.food < 0) {
console.assert(blah);
}
this.callback(data); this.callback(data);
} }
@@ -141,7 +149,12 @@ function scrapeIndices(relativeUrl, callback) {
} }
function main() { function main() {
var relativePath = '/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html'; var relativePaths = [
'/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html',
'/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html',
'/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html',
'/Restaurants-g298184-Tokyo_Tokyo_Prefecture_Kanto.html'
];
var databasePath = 'data.json'; var databasePath = 'data.json';
var abort = false; var abort = false;
@@ -151,6 +164,7 @@ function main() {
}); });
var results = []; var results = [];
_.each(relativePaths, function(relativePath) {
scrapeIndices(relativePath, function(relativeUrl) { scrapeIndices(relativePath, function(relativeUrl) {
scrapeReview(relativeUrl, function(data) { scrapeReview(relativeUrl, function(data) {
results.push(data); results.push(data);
@@ -158,8 +172,10 @@ function main() {
return abort; return abort;
}); });
});
process.on('exit', function() { process.on('exit', function() {
console.log('Total reviews scraped: %d', results.length);
var strData = JSON.stringify(results, null, 4); var strData = JSON.stringify(results, null, 4);
fs.writeFileSync(databasePath, strData); fs.writeFileSync(databasePath, strData);
}); });