1

Switching to sqlite3

This commit is contained in:
Alex Yatskov 2015-08-23 17:17:38 +09:00
parent 6f57277ed3
commit c63c6a835f
16 changed files with 4 additions and 50691 deletions

1
db/.gitignore vendored
View File

@ -1 +0,0 @@
node_modules

View File

@ -1,101 +0,0 @@
#!/usr/bin/env node
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
var mysql = require('mysql');
var data = require('./data.json');
//
// Setup
//
var conn = mysql.createConnection({host: 'localhost', user: 'hscd'});

// Start from an empty database; statements are queued on the connection in call order.
[
    'DROP DATABASE IF EXISTS hscd',
    'CREATE DATABASE hscd',
    'USE hscd'
].forEach(function(statement) {
    conn.query(statement);
});

//
// Reviews: one row per record of data.json, with accessCount starting at zero.
//

conn.query('DROP TABLE IF EXISTS reviews');
conn.query('CREATE TABLE reviews(name VARCHAR(100) NOT NULL, url VARCHAR(200) NOT NULL, delicious FLOAT NOT NULL, accommodating FLOAT NOT NULL, affordable FLOAT NOT NULL, atmospheric FLOAT NOT NULL, latitude FLOAT NOT NULL, longitude FLOAT NOT NULL, distanceToStn FLOAT NOT NULL, closestStn VARCHAR(100) NOT NULL, accessCount INT NOT NULL, id INT NOT NULL AUTO_INCREMENT PRIMARY KEY) DEFAULT CHARACTER SET utf8');

var insertReviewSql = 'INSERT INTO reviews(name, url, delicious, accommodating, affordable, atmospheric, latitude, longitude, distanceToStn, closestStn, accessCount) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';

data.forEach(function(review) {
    var scores = review.rating;
    var position = review.geo;

    // Placeholder order must match the column list in insertReviewSql.
    conn.query(insertReviewSql, [
        review.name,
        review.relativeUrl,
        scores.food,
        scores.service,
        scores.value,
        scores.atmosphere,
        position.latitude,
        position.longitude,
        review.distanceToStn,
        review.closestStn,
        0
    ]);
});

//
// Categories: a fixed list of preference statements.
//

conn.query('DROP TABLE IF EXISTS categories');
conn.query('CREATE TABLE categories(description VARCHAR(200) NOT NULL, id INT NOT NULL AUTO_INCREMENT PRIMARY KEY)');

var categories = [
    'I prefer quiet places',
    'I enjoy Mexican Food',
    'I drive a car'
];

categories.forEach(function(description) {
    conn.query('INSERT INTO categories(description) VALUES (?)', [description]);
});

//
// History: created empty; each row references a review.
//

conn.query('DROP TABLE IF EXISTS history');
conn.query('CREATE TABLE history(date DATETIME NOT NULL, reviewId INT NOT NULL, id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, FOREIGN KEY(reviewId) REFERENCES reviews(id))');

//
// HistoryGroup: created empty; each row references a history entry and a category.
//

conn.query('DROP TABLE IF EXISTS historyGroups');
conn.query('CREATE TABLE historyGroups(categoryId INT NOT NULL, categoryValue FLOAT NOT NULL, historyId INT NOT NULL, FOREIGN KEY(historyId) REFERENCES history(id), FOREIGN KEY(categoryId) REFERENCES categories(id))');

//
// Cleanup
//

conn.end();

2
db/cache/.gitignore vendored
View File

@ -1,2 +0,0 @@
*.html*
!.gitignore

15626
db/cache/geo.json vendored

File diff suppressed because it is too large Load Diff

27627
db/data.json

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +0,0 @@
#!/bin/bash
# Dump the hscd database, connecting as MySQL user hscd, into hscd.sql in the current directory.
mysqldump -u hscd hscd > hscd.sql

View File

@ -1,102 +0,0 @@
#!/usr/bin/env node
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
var _ = require('underscore');
var geocoder = require('node-geocoder');
var geolib = require('geolib');
var jf = require('jsonfile');
/**
 * Resolves an address to a geocoder result, consulting the cache first.
 *
 * A cache hit invokes the callback synchronously and leaves the sequence
 * number untouched. A miss schedules a geocoder request delayed by
 * sequence * 200 ms, so that successive misses are spaced out, and returns
 * the next sequence number.
 *
 * @param {Object} gc - geocoder exposing geocode(address, function(err, res)).
 * @param {string} address - address to look up; also the cache key.
 * @param {Object} cache - address -> result map; successful lookups are added to it.
 * @param {number} sequence - number of requests scheduled so far.
 * @param {Function} callback - called with the result, or null when the lookup
 *     failed or produced no results.
 * @returns {number} the sequence number to pass to the next call.
 */
function queryPosition(gc, address, cache, sequence, callback) {
    if (Object.prototype.hasOwnProperty.call(cache, address)) {
        console.log('Cache lookup success for:\n\t%s', address);
        callback(cache[address]);
        return sequence;
    }

    setTimeout(function() {
        gc.geocode(address, function(err, res) {
            // A missing or empty result list is a failed lookup: do not report it
            // as a success and do not store `undefined` in the cache.
            if (err || !res || res.length === 0) {
                console.log('Geocode lookup fail for: \n\t%s', address);
                callback(null);
                return;
            }

            console.log('Geocode lookup success for: \n\t%s', address);
            cache[address] = res[0];
            callback(res[0]);
        });
    }, sequence * 200);

    return sequence + 1;
}
/**
 * Annotates every review in place with its nearest station.
 *
 * For each review, the geolib distance from review.geo to every station's
 * geo is computed; the smallest one is stored as distanceToStn and the key
 * of that station as closestStn. With no stations the review gets
 * Number.MAX_VALUE and an empty station name.
 *
 * @param {Array} reviewData - reviews carrying `name` and `geo`; mutated.
 * @param {Object} stationData - stations keyed by name, each carrying `geo`.
 * @param {*} accessibility - unused.
 */
function buildAccess(reviewData, stationData, accessibility) {
    _.each(reviewData, function(review) {
        var nearest = {name: '', distance: Number.MAX_VALUE};

        console.log('Computing access for: \n\t%s', review.name);

        _.each(stationData, function(stationItem, stationName) {
            var candidate = geolib.getDistance(review.geo, stationItem.geo);
            // Strict comparison: on a tie the station seen first wins.
            if (candidate < nearest.distance) {
                nearest = {name: stationName, distance: candidate};
            }
        });

        review.distanceToStn = nearest.distance;
        review.closestStn = nearest.name;
    });
}
/**
 * Geocodes every review in data.json and rewrites the file with positions.
 *
 * Reviews whose address cannot be resolved are dropped. Once every lookup
 * has reported back, the surviving reviews are annotated with their nearest
 * station and both data.json and the geocoder cache are written out.
 */
function main() {
    var gc = geocoder.getGeocoder('google', 'http', {});
    var stationData = jf.readFileSync('stations.json');
    // A missing or unreadable cache file simply means starting with an empty cache.
    var cacheData = jf.readFileSync('cache/geo.json', {throws: false}) || {};
    var reviewData = jf.readFileSync('data.json');

    var located = [];
    var pending = reviewData.length;
    var sequence = 0;

    function finish() {
        buildAccess(located, stationData);
        jf.writeFileSync('data.json', located);
        jf.writeFileSync('cache/geo.json', cacheData);
    }

    reviewData.forEach(function(reviewItem) {
        sequence = queryPosition(gc, reviewItem.address, cacheData, sequence, function(geo) {
            if (geo) {
                // Copy the review so the source record is left untouched.
                located.push(_.extend({}, reviewItem, {geo: geo}));
            }

            pending -= 1;
            if (pending === 0) {
                finish();
            }
        });
    });
}
// Run main() only when this file is executed directly, not when it is loaded via require().
if (require.main === module) {
main();
}

File diff suppressed because one or more lines are too long

View File

@ -1,2 +0,0 @@
#!/bin/bash
# Run init.sql as the MySQL root user; -p makes the client prompt for the password.
mysql -p -u root < init.sql

View File

@ -1,4 +0,0 @@
-- Create the hscd database and a local user 'hscd' (no IDENTIFIED BY clause is given)
-- and grant that user all privileges on every object in the database.
CREATE DATABASE hscd;
USE hscd;
CREATE USER 'hscd'@'localhost';
GRANT ALL PRIVILEGES ON hscd . * to 'hscd'@'localhost';

View File

@ -1,2 +0,0 @@
#!/bin/bash
# Load hscd.sql into the hscd database, connecting as MySQL user hscd.
mysql -u hscd hscd < hscd.sql

View File

@ -1,19 +0,0 @@
{
"name": "hscd_scrape",
"version": "0.0.0",
"description": "",
"main": "scrape.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "MIT",
"dependencies": {
"cheerio": "~0.17.0",
"mysql": "^2.5.0",
"underscore": "~1.6.0",
"node-geocoder": "~2.11.0",
"geolib": "~2.0.14",
"jsonfile": "~2.0.0"
}
}

View File

@ -1,185 +0,0 @@
#!/usr/bin/env node
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
var cheerio = require('cheerio');
var request = require('request');
var underscore = require('underscore');
var url = require('url');
var path = require('path');
var fs = require('fs');
var _ = require('underscore');
/**
 * Fetches a page relative to http://www.tripadvisor.com, preferring the
 * copy stored under the local `cache` directory.
 *
 * On a cache hit the callback receives (null, null, fileContents). On a miss
 * the page is requested with `callback` as the request callback while the
 * response is piped into the cache file.
 *
 * NOTE(review): the cache file is created before the download completes, so a
 * failed request appears to leave an empty or partial file that later runs
 * would treat as a cache hit - confirm.
 *
 * @param {string} relativeUrl - site-relative URL; also the path below `cache`.
 * @param {Function} callback - function(err, resp, html).
 */
function requestCached(relativeUrl, callback) {
    var absoluteUrl = url.resolve('http://www.tripadvisor.com', relativeUrl);
    var cachePath = path.join('cache', relativeUrl);

    fs.readFile(cachePath, function(readErr, cachedHtml) {
        if (!readErr) {
            callback(null, null, cachedHtml);
            return;
        }

        // Any read failure is treated as a cache miss.
        var cacheStream = fs.createWriteStream(cachePath);
        request(absoluteUrl, callback).pipe(cacheStream);
    });
}
/**
 * Converts a rating bar element into a fraction of the five-point scale.
 *
 * @param {Object} bar - element wrapper whose `alt` attribute starts with the score.
 * @returns {number} the parsed score divided by 5; NaN when `alt` has no leading number.
 */
function getBarPercent(bar) {
    var altText = bar.attr('alt');
    var score = parseFloat(altText);
    return score / 5.0;
}
/**
 * Request callback that extracts one restaurant review from its HTML page.
 *
 * Expects `this` to carry `relativeUrl` and `callback`. A page is skipped,
 * with a warning, when it has no address, when its heading contains
 * 'CLOSED', when it does not have exactly four rating bars, or when all four
 * ratings are zero. Otherwise this.callback receives the store name, URL,
 * address and the four ratings rescaled from [0, 1] to [-1, 1].
 *
 * @param {*} err - request error, if any; logged and the page is abandoned.
 * @param {*} resp - response object; unused.
 * @param {string|Buffer} html - page markup.
 */
function reviewScraped(err, resp, html) {
    if (err) {
        return console.error('Error: %s', err);
    }

    var $ = cheerio.load(html);

    var address = $('address span.format_address').text().trim();
    if (!address) {
        console.warn('Warning: review skipped, no address');
        return;
    }

    var storeName = $('h1#HEADING').text().trim();
    if (storeName.indexOf('CLOSED') !== -1) {
        console.warn('Warning: review skipped, closed');
        return;
    }

    var bars = $('ul.barChart img.rating_s_fill');
    if (bars.length != 4) {
        console.warn('Warning: review skipped, no summary');
        return;
    }

    // Bar order on the page: food, service, value, atmosphere.
    var percents = [];
    for (var barIndex = 0; barIndex < 4; ++barIndex) {
        percents.push(getBarPercent($(bars[barIndex])));
    }

    var allZero = percents.every(function(percent) {
        return percent === 0.0;
    });
    if (allZero) {
        console.warn('Warning: review skipped, empty review');
        return;
    }

    // Map a [0, 1] fraction onto [-1, 1].
    var centered = function(percent) {
        return (percent - 0.5) * 2.0;
    };

    this.callback({
        name: storeName,
        relativeUrl: this.relativeUrl,
        address: address,
        rating: {
            food: centered(percents[0]),
            service: centered(percents[1]),
            value: centered(percents[2]),
            atmosphere: centered(percents[3])
        }
    });
}
/**
 * Fetches one review page (through the cache) and hands it to reviewScraped.
 *
 * @param {string} relativeUrl - site-relative URL of the review page.
 * @param {Function} callback - receives the extracted review data.
 */
function scrapeReview(relativeUrl, callback) {
    console.log('Scraping review %s...', relativeUrl);

    // reviewScraped reads its inputs from `this`, so bind them up front.
    var context = {
        callback: callback,
        relativeUrl: relativeUrl
    };
    requestCached(relativeUrl, reviewScraped.bind(context));
}
/**
 * Request callback that walks one restaurant index page.
 *
 * Expects `this` to carry `callback`. Each `a.property_title` link's href is
 * passed to this.callback; a truthy return value stops processing of the
 * remaining links and of any following pages. Unless stopped, the
 * `a.sprite-pageNext` link, when present, is scraped next.
 *
 * @param {*} err - request error, if any; logged and the page is abandoned.
 * @param {*} resp - response object; unused.
 * @param {string|Buffer} html - page markup.
 */
function indexScraped(err, resp, html) {
    if (err) {
        return console.error('Error: %s', err);
    }

    var $ = cheerio.load(html);
    var context = this;
    var stopped = false;

    $('a.property_title').each(function(index, element) {
        if (stopped) {
            return;
        }

        if (context.callback($(element).attr('href'))) {
            stopped = true;
        }
    });

    if (stopped) {
        return;
    }

    var nextPageUrl = $('a.sprite-pageNext').attr('href');
    if (nextPageUrl) {
        scrapeIndices(nextPageUrl, this.callback);
    }
}
/**
 * Fetches one index page (through the cache) and hands it to indexScraped.
 *
 * @param {string} relativeUrl - site-relative URL of the index page.
 * @param {Function} callback - called with each review URL; return truthy to stop.
 */
function scrapeIndices(relativeUrl, callback) {
    console.log('Scraping index %s...', relativeUrl);

    // indexScraped reads the per-link callback from `this`.
    var context = { callback: callback };
    requestCached(relativeUrl, indexScraped.bind(context));
}
/**
 * Scrapes the configured restaurant index pages and every review linked from
 * them, then writes all collected reviews to data.json when the process exits.
 *
 * SIGINT sets an abort flag instead of terminating: index scraping stops
 * following links and pages once the flag is set.
 */
function main() {
    var databasePath = 'data.json';
    var results = [];
    var abort = false;

    process.on('SIGINT', function() {
        console.warn('Caught SIGINT, aborting...');
        abort = true;
    });

    function collect(data) {
        results.push(data);
    }

    function visitReview(relativeUrl) {
        scrapeReview(relativeUrl, collect);
        // A truthy return tells the index scraper to stop.
        return abort;
    }

    [
        '/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html',
        '/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html',
        '/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html',
        '/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html',
        '/Restaurants-g1066854-Shinagawa_Tokyo_Tokyo_Prefecture_Kanto.html',
        '/Restaurants-g298184-Tokyo_Tokyo_Prefecture_Kanto.html'
    ].forEach(function(relativePath) {
        scrapeIndices(relativePath, visitReview);
    });

    // Exit handlers must be synchronous, hence writeFileSync.
    process.on('exit', function() {
        console.log('Total reviews scraped: %d', results.length);
        fs.writeFileSync(databasePath, JSON.stringify(results, null, 4));
    });
}
// Run main() only when this file is executed directly, not when it is loaded via require().
if (require.main === module) {
main();
}

File diff suppressed because it is too large Load Diff

View File

@ -40,7 +40,7 @@ import (
"time"
"github.com/GaryBoone/GoStats/stats"
_ "github.com/go-sql-driver/mysql"
_ "github.com/mattn/go-sqlite3"
)
var db *sql.DB
@ -312,12 +312,12 @@ func main() {
staticDir := flag.String("static", "static", "path to static files")
portNum := flag.Int("port", 8080, "port to serve content on")
dataSrc := flag.String("data", "hscd@/hscd", "data source for database")
dataSrc := flag.String("data", "db.sqlite3", "data source for database")
profile := flag.String("profile", "", "write cpu profile to file")
flag.Parse()
var err error
if db, err = sql.Open("mysql", *dataSrc); err != nil {
if db, err = sql.Open("sqlite3", *dataSrc); err != nil {
log.Fatal(err)
}
defer db.Close()

View File

@ -270,7 +270,7 @@ func computeRecordsCompat(entries records, context queryContext) {
}
func getRecords(context queryContext) records {
recordRows, err := db.Query("SELECT name, url, delicious, accommodating, affordable, atmospheric, latitude, longitude, distanceToStn, closestStn, accessCount, id FROM reviews")
recordRows, err := db.Query("SELECT name, url, delicious, accommodating, affordable, atmospheric, latitude, longitude, closestStnDist, closestStnName, accessCount, id FROM reviews")
if err != nil {
log.Fatal(err)
}