1
restaurant-search/db/scrape.js

186 lines
5.5 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
2014-07-08 04:35:52 +00:00
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
2015-03-26 06:46:09 +00:00
var cheerio = require('cheerio');
var request = require('request');
var underscore = require('underscore');
var url = require('url');
var path = require('path');
var fs = require('fs');
var _ = require('underscore');
2014-07-08 04:35:52 +00:00
function requestCached(relativeUrl, callback) {
var absoluteUrl = url.resolve('http://www.tripadvisor.com', relativeUrl);
var cachePath = path.join('cache', relativeUrl);
fs.readFile(cachePath, function(err, data) {
if (err) {
var stream = fs.createWriteStream(cachePath);
request(absoluteUrl, callback).pipe(stream);
}
else {
callback(null, null, data);
}
});
}
function getBarPercent(bar) {
2015-03-26 06:46:09 +00:00
return parseFloat(bar.attr('alt')) / 5.0;
2014-07-08 04:35:52 +00:00
}
function reviewScraped(err, resp, html) {
if (err) {
return console.error('Error: %s', err);
}
var $ = cheerio.load(html);
2015-03-26 06:46:09 +00:00
var address = $('address span.format_address').text().trim();
2014-11-17 04:23:58 +00:00
if (!address) {
2015-03-26 06:46:09 +00:00
console.warn('Warning: review skipped, no address');
2014-07-08 04:35:52 +00:00
return;
}
var storeName = $('h1#HEADING').text().trim();
2014-11-17 04:23:58 +00:00
if (storeName.indexOf('CLOSED') !== -1) {
2015-03-26 06:46:09 +00:00
console.warn('Warning: review skipped, closed');
2014-11-17 04:23:58 +00:00
return;
}
2015-03-26 06:46:09 +00:00
var rating = $('ul.barChart img.rating_s_fill');
if (rating.length != 4) {
console.warn('Warning: review skipped, no summary');
2014-07-08 04:35:52 +00:00
return;
}
2015-03-26 06:46:09 +00:00
var rateFood = getBarPercent($(rating[0]));
var rateService = getBarPercent($(rating[1]));
var rateValue = getBarPercent($(rating[2]));
var rateAtmosphere = getBarPercent($(rating[3]));
2014-07-08 04:35:52 +00:00
if (rateFood === 0.0 && rateService === 0.0 && rateValue === 0.0 && rateAtmosphere === 0.0) {
2015-03-26 06:46:09 +00:00
console.warn('Warning: review skipped, empty review');
2014-07-08 04:35:52 +00:00
return;
}
var data = {
2014-11-17 04:23:58 +00:00
name: storeName,
relativeUrl: this.relativeUrl,
address: address,
rating: {
food: (rateFood - 0.5) * 2.0,
service: (rateService - 0.5) * 2.0,
value: (rateValue - 0.5) * 2.0,
atmosphere: (rateAtmosphere - 0.5) * 2.0
2014-07-08 04:35:52 +00:00
}
};
this.callback(data);
}
function scrapeReview(relativeUrl, callback) {
console.log('Scraping review %s...', relativeUrl);
var c = _.bind(reviewScraped, {
callback: callback,
relativeUrl: relativeUrl
});
requestCached(relativeUrl, c);
}
function indexScraped(err, resp, html) {
if (err) {
return console.error('Error: %s', err);
}
var $ = cheerio.load(html);
var that = this;
var abort = false;
$('a.property_title').each(function(index, element) {
if (abort) {
return;
}
var reviewUrl = $(element).attr('href');
if (that.callback(reviewUrl)) {
abort = true;
}
});
if (!abort) {
var nextPageUrl = $('a.sprite-pageNext').attr('href');
if (nextPageUrl) {
scrapeIndices(nextPageUrl, this.callback);
}
}
}
function scrapeIndices(relativeUrl, callback) {
console.log('Scraping index %s...', relativeUrl);
var c = _.bind(indexScraped, { callback: callback });
requestCached(relativeUrl, c);
}
function main() {
2015-03-26 06:46:09 +00:00
var relativePaths = [
'/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html',
'/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html',
'/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html',
2015-03-26 07:15:19 +00:00
'/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html',
'/Restaurants-g1066854-Shinagawa_Tokyo_Tokyo_Prefecture_Kanto.html',
2015-03-26 06:46:09 +00:00
'/Restaurants-g298184-Tokyo_Tokyo_Prefecture_Kanto.html'
];
2014-07-08 04:35:52 +00:00
var databasePath = 'data.json';
var abort = false;
process.on('SIGINT', function() {
console.warn('Caught SIGINT, aborting...');
abort = true;
});
var results = [];
2015-03-26 06:46:09 +00:00
_.each(relativePaths, function(relativePath) {
scrapeIndices(relativePath, function(relativeUrl) {
scrapeReview(relativeUrl, function(data) {
results.push(data);
});
2014-07-08 04:35:52 +00:00
2015-03-26 06:46:09 +00:00
return abort;
});
2014-07-08 04:35:52 +00:00
});
process.on('exit', function() {
2015-03-26 06:46:09 +00:00
console.log('Total reviews scraped: %d', results.length);
var strData = JSON.stringify(results, null, 4);
2014-07-08 04:35:52 +00:00
fs.writeFileSync(databasePath, strData);
});
}
if (require.main === module) {
main();
}