1
restaurant-search/scrape.go

227 lines
5.3 KiB
Go
Raw Normal View History

2015-08-11 11:30:42 +00:00
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
2015-08-21 09:21:52 +00:00
import (
"bufio"
2015-08-23 06:16:01 +00:00
"database/sql"
2015-08-21 09:21:52 +00:00
"errors"
"net/url"
"os"
2015-08-23 06:16:01 +00:00
_ "github.com/mattn/go-sqlite3"
2015-08-21 09:21:52 +00:00
)
2015-08-16 10:12:16 +00:00
2015-08-23 08:03:30 +00:00
func scrapeDataUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) {
2015-08-21 09:21:52 +00:00
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer file.Close()
var results []restaurant
2015-08-22 09:14:42 +00:00
var scanner = bufio.NewScanner(file)
2015-08-21 09:21:52 +00:00
for scanner.Scan() {
if line := scanner.Text(); len(line) > 0 {
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
var items []restaurant
switch parsed.Host {
case "tabelog.com":
items = scrape(line, wc, gc, tabelog{})
case "www.tripadvisor.com":
items = scrape(line, wc, gc, tripadvisor{})
default:
return nil, errors.New("unsupported review site")
}
results = append(results, items...)
}
}
return results, nil
2015-08-17 05:23:03 +00:00
}
2015-08-22 09:14:42 +00:00
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
gc, err := newGeoCache(geocachePath)
2015-08-16 10:12:16 +00:00
if err != nil {
2015-08-22 09:14:42 +00:00
return nil, err
2015-08-16 10:12:16 +00:00
}
defer gc.save()
2015-08-22 09:14:42 +00:00
wc, err := newWebCache(webcachePath)
2015-08-16 10:12:16 +00:00
if err != nil {
2015-08-22 09:14:42 +00:00
return nil, err
2015-08-16 10:12:16 +00:00
}
2015-08-23 08:03:30 +00:00
restaurants, err := scrapeDataUrls(urlsPath, wc, gc)
2015-08-22 08:54:46 +00:00
if err != nil {
2015-08-22 09:14:42 +00:00
return nil, err
}
return restaurants, nil
}
2015-08-23 08:03:30 +00:00
func computeStnData(restaurants []restaurant, stationsPath string) error {
2015-08-22 09:14:42 +00:00
sq, err := newStationQuery(stationsPath)
if err != nil {
return err
2015-08-22 08:54:46 +00:00
}
for i, _ := range restaurants {
r := &restaurants[i]
r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
}
2015-08-22 09:14:42 +00:00
return nil
}
2015-08-23 06:43:07 +00:00
// buildFeatures maps the restaurant's feature entries onto the four scores
// stored in the database: "food" -> delicious, "service" -> accommodating,
// "value" -> affordable and "atmosphere" -> atmospheric.
func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) {
	scores := r.features

	delicious = scores["food"]
	accommodating = scores["service"]
	affordable = scores["value"]
	atmospheric = scores["atmosphere"]

	return delicious, accommodating, affordable, atmospheric
}
2015-08-23 06:16:01 +00:00
func dumpData(dbPath string, restaraunts []restaurant) error {
db, err := sql.Open("sqlite3", dbPath)
if err != nil {
return err
}
defer db.Close()
_, err = db.Exec(`
DROP TABLE IF EXISTS reviews;
CREATE TABLE reviews(
name VARCHAR(100) NOT NULL,
url VARCHAR(200) NOT NULL,
delicious FLOAT NOT NULL,
accommodating FLOAT NOT NULL,
affordable FLOAT NOT NULL,
atmospheric FLOAT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL,
2015-08-23 06:36:37 +00:00
closestStnDist FLOAT NOT NULL,
closestStnName VARCHAR(100) NOT NULL,
2015-08-23 07:54:49 +00:00
accessCount INTEGER NOT NULL,
id INTEGER PRIMARY KEY
2015-08-23 06:36:37 +00:00
)`)
2015-08-23 06:16:01 +00:00
if err != nil {
return err
}
for _, r := range restaraunts {
2015-08-23 06:43:07 +00:00
delicious, accommodating, affordable, atmospheric := buildFeatures(r)
2015-08-23 06:16:01 +00:00
_, err = db.Exec(`
INSERT INTO reviews(
name,
url,
delicious,
accommodating,
affordable,
atmospheric,
latitude,
longitude,
closestStnDist,
closestStnName,
accessCount
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
r.name,
r.url,
2015-08-23 06:43:07 +00:00
delicious,
accommodating,
affordable,
atmospheric,
2015-08-23 06:16:01 +00:00
r.longitude,
r.latitude,
r.closestStnDist,
2015-08-23 06:36:37 +00:00
r.closestStnName,
0)
2015-08-23 06:16:01 +00:00
if err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS categories;
CREATE TABLE categories(
description VARCHAR(200) NOT NULL,
2015-08-23 07:54:49 +00:00
id INTEGER PRIMARY KEY)`)
2015-08-23 06:16:01 +00:00
if err != nil {
return err
}
for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS history;
CREATE TABLE history(
date DATETIME NOT NULL,
2015-08-23 07:54:49 +00:00
reviewId INTEGER NOT NULL,
id INTEGER PRIMARY KEY,
2015-08-23 06:16:01 +00:00
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
if err != nil {
return err
}
_, err = db.Exec(`
DROP TABLE IF EXISTS historyGroups;
CREATE TABLE historyGroups(
2015-08-23 07:54:49 +00:00
categoryId INTEGER NOT NULL,
2015-08-23 06:16:01 +00:00
categoryValue FLOAT NOT NULL,
2015-08-23 07:54:49 +00:00
historyId INTEGER NOT NULL,
2015-08-23 06:16:01 +00:00
FOREIGN KEY(historyId) REFERENCES history(id),
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
if err != nil {
return err
}
return nil
}
2015-08-22 09:14:42 +00:00
func main() {
2015-08-23 06:16:01 +00:00
restaurants, err := scrapeData("data/urls.txt", "cache/geocache.json", "cache/webcache")
2015-08-22 09:14:42 +00:00
if err != nil {
panic(err)
}
2015-08-23 08:03:30 +00:00
if err := computeStnData(restaurants, "data/stations.json"); err != nil {
2015-08-23 06:16:01 +00:00
panic(err)
}
2015-08-23 08:16:01 +00:00
if err := dumpData("data/db.sqlite3", restaurants); err != nil {
2015-08-22 06:20:38 +00:00
panic(err)
2015-08-17 05:23:03 +00:00
}
2015-08-11 11:30:42 +00:00
}