Updating scraper
This commit is contained in:
parent
aecaf6a8f8
commit
7a8011fc0c
@ -13,6 +13,7 @@
|
|||||||
"mysql": "^2.5.0",
|
"mysql": "^2.5.0",
|
||||||
"underscore": "~1.6.0",
|
"underscore": "~1.6.0",
|
||||||
"node-geocoder": "~2.11.0",
|
"node-geocoder": "~2.11.0",
|
||||||
"geolib": "~2.0.14"
|
"geolib": "~2.0.14",
|
||||||
|
"jsonfile": "~2.0.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
58
db/scrape.js
58
db/scrape.js
@ -21,12 +21,13 @@
|
|||||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
var cheerio = require('cheerio');
|
var cheerio = require('cheerio');
|
||||||
var request = require('request');
|
var request = require('request');
|
||||||
var url = require('url');
|
var underscore = require('underscore');
|
||||||
var path = require('path');
|
var url = require('url');
|
||||||
var fs = require('fs');
|
var path = require('path');
|
||||||
var _ = require('underscore');
|
var fs = require('fs');
|
||||||
|
var _ = require('underscore');
|
||||||
|
|
||||||
|
|
||||||
function requestCached(relativeUrl, callback) {
|
function requestCached(relativeUrl, callback) {
|
||||||
@ -45,8 +46,7 @@ function requestCached(relativeUrl, callback) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function getBarPercent(bar) {
|
function getBarPercent(bar) {
|
||||||
var width = bar.css('width');
|
return parseFloat(bar.attr('alt')) / 5.0;
|
||||||
return parseInt(width) / 91.0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function reviewScraped(err, resp, html) {
|
function reviewScraped(err, resp, html) {
|
||||||
@ -56,27 +56,31 @@ function reviewScraped(err, resp, html) {
|
|||||||
|
|
||||||
var $ = cheerio.load(html);
|
var $ = cheerio.load(html);
|
||||||
|
|
||||||
var address = $('div.addr').text().trim();
|
var address = $('address span.format_address').text().trim();
|
||||||
if (!address) {
|
if (!address) {
|
||||||
|
console.warn('Warning: review skipped, no address');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
var storeName = $('h1#HEADING').text().trim();
|
var storeName = $('h1#HEADING').text().trim();
|
||||||
if (storeName.indexOf('CLOSED') !== -1) {
|
if (storeName.indexOf('CLOSED') !== -1) {
|
||||||
|
console.warn('Warning: review skipped, closed');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
var bars = $('div.fill');
|
var rating = $('ul.barChart img.rating_s_fill');
|
||||||
if (bars.length !== 9) {
|
if (rating.length != 4) {
|
||||||
|
console.warn('Warning: review skipped, no summary');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
var rateFood = getBarPercent($(bars[5]));
|
var rateFood = getBarPercent($(rating[0]));
|
||||||
var rateService = getBarPercent($(bars[6]));
|
var rateService = getBarPercent($(rating[1]));
|
||||||
var rateValue = getBarPercent($(bars[7]));
|
var rateValue = getBarPercent($(rating[2]));
|
||||||
var rateAtmosphere = getBarPercent($(bars[8]));
|
var rateAtmosphere = getBarPercent($(rating[3]));
|
||||||
|
|
||||||
if (rateFood === 0.0 && rateService === 0.0 && rateValue === 0.0 && rateAtmosphere === 0.0) {
|
if (rateFood === 0.0 && rateService === 0.0 && rateValue === 0.0 && rateAtmosphere === 0.0) {
|
||||||
|
console.warn('Warning: review skipped, empty review');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -92,6 +96,10 @@ function reviewScraped(err, resp, html) {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (data.rating.food < 0) {
|
||||||
|
console.assert(blah);
|
||||||
|
}
|
||||||
|
|
||||||
this.callback(data);
|
this.callback(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -141,7 +149,12 @@ function scrapeIndices(relativeUrl, callback) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function main() {
|
function main() {
|
||||||
var relativePath = '/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html';
|
var relativePaths = [
|
||||||
|
'/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html',
|
||||||
|
'/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html',
|
||||||
|
'/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html',
|
||||||
|
'/Restaurants-g298184-Tokyo_Tokyo_Prefecture_Kanto.html'
|
||||||
|
];
|
||||||
var databasePath = 'data.json';
|
var databasePath = 'data.json';
|
||||||
|
|
||||||
var abort = false;
|
var abort = false;
|
||||||
@ -151,15 +164,18 @@ function main() {
|
|||||||
});
|
});
|
||||||
|
|
||||||
var results = [];
|
var results = [];
|
||||||
scrapeIndices(relativePath, function(relativeUrl) {
|
_.each(relativePaths, function(relativePath) {
|
||||||
scrapeReview(relativeUrl, function(data) {
|
scrapeIndices(relativePath, function(relativeUrl) {
|
||||||
results.push(data);
|
scrapeReview(relativeUrl, function(data) {
|
||||||
});
|
results.push(data);
|
||||||
|
});
|
||||||
|
|
||||||
return abort;
|
return abort;
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
process.on('exit', function() {
|
process.on('exit', function() {
|
||||||
|
console.log('Total reviews scraped: %d', results.length);
|
||||||
var strData = JSON.stringify(results, null, 4);
|
var strData = JSON.stringify(results, null, 4);
|
||||||
fs.writeFileSync(databasePath, strData);
|
fs.writeFileSync(databasePath, strData);
|
||||||
});
|
});
|
||||||
|
Loading…
Reference in New Issue
Block a user