Refactor
This commit is contained in:
parent
f3566cab3c
commit
bcfeae55b8
238
build/build.go
238
build/build.go
@ -24,7 +24,6 @@ package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"flag"
|
||||
"log"
|
||||
@ -34,14 +33,14 @@ import (
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant, error) {
|
||||
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
|
||||
file, err := os.Open(urlsPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var results []restaurant
|
||||
var reviews []review
|
||||
var scanner = bufio.NewScanner(file)
|
||||
|
||||
for scanner.Scan() {
|
||||
@ -51,24 +50,21 @@ func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant,
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var items []restaurant
|
||||
switch parsed.Host {
|
||||
case "tabelog.com":
|
||||
items = scrape(line, wc, gc, tabelog{})
|
||||
reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
|
||||
case "www.tripadvisor.com":
|
||||
items = scrape(line, wc, gc, tripadvisor{})
|
||||
reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
|
||||
default:
|
||||
return nil, errors.New("unsupported review site")
|
||||
}
|
||||
|
||||
results = append(results, items...)
|
||||
}
|
||||
}
|
||||
|
||||
return results, nil
|
||||
return reviews, nil
|
||||
}
|
||||
|
||||
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
|
||||
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
|
||||
gc, err := newGeoCache(geocachePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -80,131 +76,131 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, erro
|
||||
return nil, err
|
||||
}
|
||||
|
||||
restaurants, err := scrapeDataUrls(urlsPath, wc, gc)
|
||||
reviews, err := scrapeDataUrls(urlsPath, wc, gc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return restaurants, nil
|
||||
return reviews, nil
|
||||
}
|
||||
|
||||
func computeStnData(restaurants []restaurant, stationsPath string) error {
|
||||
sq, err := newStationQuery(stationsPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// func computeStnData(reviews []restaurant, stationsPath string) error {
|
||||
// sq, err := newStationQuery(stationsPath)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
|
||||
for i, _ := range restaurants {
|
||||
r := &restaurants[i]
|
||||
r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
|
||||
}
|
||||
// for i, _ := range reviews {
|
||||
// r := &reviews[i]
|
||||
// r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
|
||||
// }
|
||||
|
||||
return nil
|
||||
}
|
||||
// return nil
|
||||
// }
|
||||
|
||||
func dumpData(dbPath string, restaraunts []restaurant) error {
|
||||
db, err := sql.Open("sqlite3", dbPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer db.Close()
|
||||
// func dumpData(dbPath string, restaraunts []restaurant) error {
|
||||
// db, err := sql.Open("sqlite3", dbPath)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// defer db.Close()
|
||||
|
||||
_, err = db.Exec(`
|
||||
DROP TABLE IF EXISTS reviews;
|
||||
CREATE TABLE reviews(
|
||||
name VARCHAR(100) NOT NULL,
|
||||
url VARCHAR(200) NOT NULL,
|
||||
delicious FLOAT NOT NULL,
|
||||
accommodating FLOAT NOT NULL,
|
||||
affordable FLOAT NOT NULL,
|
||||
atmospheric FLOAT NOT NULL,
|
||||
latitude FLOAT NOT NULL,
|
||||
longitude FLOAT NOT NULL,
|
||||
closestStnDist FLOAT NOT NULL,
|
||||
closestStnName VARCHAR(100) NOT NULL,
|
||||
accessCount INTEGER NOT NULL,
|
||||
id INTEGER PRIMARY KEY
|
||||
)`)
|
||||
// _, err = db.Exec(`
|
||||
// DROP TABLE IF EXISTS reviews;
|
||||
// CREATE TABLE reviews(
|
||||
// name VARCHAR(100) NOT NULL,
|
||||
// url VARCHAR(200) NOT NULL,
|
||||
// delicious FLOAT NOT NULL,
|
||||
// accommodating FLOAT NOT NULL,
|
||||
// affordable FLOAT NOT NULL,
|
||||
// atmospheric FLOAT NOT NULL,
|
||||
// latitude FLOAT NOT NULL,
|
||||
// longitude FLOAT NOT NULL,
|
||||
// closestStnDist FLOAT NOT NULL,
|
||||
// closestStnName VARCHAR(100) NOT NULL,
|
||||
// accessCount INTEGER NOT NULL,
|
||||
// id INTEGER PRIMARY KEY
|
||||
// )`)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
|
||||
for _, r := range restaraunts {
|
||||
_, err = db.Exec(`
|
||||
INSERT INTO reviews(
|
||||
name,
|
||||
url,
|
||||
delicious,
|
||||
accommodating,
|
||||
affordable,
|
||||
atmospheric,
|
||||
latitude,
|
||||
longitude,
|
||||
closestStnDist,
|
||||
closestStnName,
|
||||
accessCount
|
||||
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
r.name,
|
||||
r.url,
|
||||
r.feats.delicious,
|
||||
r.feats.accommodating,
|
||||
r.feats.affordable,
|
||||
r.feats.atmospheric,
|
||||
r.latitude,
|
||||
r.longitude,
|
||||
r.closestStnDist,
|
||||
r.closestStnName,
|
||||
0)
|
||||
// for _, r := range restaraunts {
|
||||
// _, err = db.Exec(`
|
||||
// INSERT INTO reviews(
|
||||
// name,
|
||||
// url,
|
||||
// delicious,
|
||||
// accommodating,
|
||||
// affordable,
|
||||
// atmospheric,
|
||||
// latitude,
|
||||
// longitude,
|
||||
// closestStnDist,
|
||||
// closestStnName,
|
||||
// accessCount
|
||||
// ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
// r.name,
|
||||
// r.url,
|
||||
// r.feats.delicious,
|
||||
// r.feats.accommodating,
|
||||
// r.feats.affordable,
|
||||
// r.feats.atmospheric,
|
||||
// r.latitude,
|
||||
// r.longitude,
|
||||
// r.closestStnDist,
|
||||
// r.closestStnName,
|
||||
// 0)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
_, err = db.Exec(`
|
||||
DROP TABLE IF EXISTS categories;
|
||||
CREATE TABLE categories(
|
||||
description VARCHAR(200) NOT NULL,
|
||||
id INTEGER PRIMARY KEY)`)
|
||||
// _, err = db.Exec(`
|
||||
// DROP TABLE IF EXISTS categories;
|
||||
// CREATE TABLE categories(
|
||||
// description VARCHAR(200) NOT NULL,
|
||||
// id INTEGER PRIMARY KEY)`)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
|
||||
for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
|
||||
if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
|
||||
// if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
_, err = db.Exec(`
|
||||
DROP TABLE IF EXISTS history;
|
||||
CREATE TABLE history(
|
||||
date DATETIME NOT NULL,
|
||||
reviewId INTEGER NOT NULL,
|
||||
id INTEGER PRIMARY KEY,
|
||||
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
|
||||
// _, err = db.Exec(`
|
||||
// DROP TABLE IF EXISTS history;
|
||||
// CREATE TABLE history(
|
||||
// date DATETIME NOT NULL,
|
||||
// reviewId INTEGER NOT NULL,
|
||||
// id INTEGER PRIMARY KEY,
|
||||
// FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
|
||||
_, err = db.Exec(`
|
||||
DROP TABLE IF EXISTS historyGroups;
|
||||
CREATE TABLE historyGroups(
|
||||
categoryId INTEGER NOT NULL,
|
||||
categoryValue FLOAT NOT NULL,
|
||||
historyId INTEGER NOT NULL,
|
||||
FOREIGN KEY(historyId) REFERENCES history(id),
|
||||
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
|
||||
// _, err = db.Exec(`
|
||||
// DROP TABLE IF EXISTS historyGroups;
|
||||
// CREATE TABLE historyGroups(
|
||||
// categoryId INTEGER NOT NULL,
|
||||
// categoryValue FLOAT NOT NULL,
|
||||
// historyId INTEGER NOT NULL,
|
||||
// FOREIGN KEY(historyId) REFERENCES history(id),
|
||||
// FOREIGN KEY(categoryId) REFERENCES categories(id))`)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
|
||||
return nil
|
||||
}
|
||||
// return nil
|
||||
// }
|
||||
|
||||
func main() {
|
||||
dbPath := flag.String("db", "data/db.sqlite3", "database output path")
|
||||
@ -214,16 +210,16 @@ func main() {
|
||||
webcachePath := flag.String("webcache", "cache/webcache", "web data cache")
|
||||
flag.Parse()
|
||||
|
||||
restaurants, err := scrapeData(*urlsPath, *geocachePath, *webcachePath)
|
||||
reviews, err := scrapeData(*urlsPath, *geocachePath, *webcachePath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
if err := computeStnData(restaurants, *stationsPath); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
// if err := computeStnData(reviews, *stationsPath); err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
|
||||
if err := dumpData(*dbPath, restaurants); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
// if err := dumpData(*dbPath, reviews); err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
}
|
||||
|
@ -79,19 +79,23 @@ func (c *geoCache) save() error {
|
||||
return ioutil.WriteFile(c.filename, js, 0644)
|
||||
}
|
||||
|
||||
func (c *geoCache) decode(address string) (geoPos, error) {
|
||||
func (c *geoCache) decode(address string) (latitude float64, longitude float64, err error) {
|
||||
if pos, ok := c.data[address]; ok {
|
||||
return pos, nil
|
||||
latitude = pos.Latitude
|
||||
longitude = pos.Longitude
|
||||
return
|
||||
}
|
||||
|
||||
<-c.ticker.C
|
||||
|
||||
point, err := c.coder.Geocode(address)
|
||||
if err != nil {
|
||||
return geoPos{}, err
|
||||
return
|
||||
}
|
||||
|
||||
pos := geoPos{point.Lat(), point.Lng()}
|
||||
c.data[address] = pos
|
||||
return pos, nil
|
||||
latitude = point.Lat()
|
||||
longitude = point.Lng()
|
||||
|
||||
c.data[address] = geoPos{latitude, longitude}
|
||||
return
|
||||
}
|
||||
|
@ -30,19 +30,17 @@ import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
type features struct {
|
||||
delicious float64
|
||||
accommodating float64
|
||||
affordable float64
|
||||
atmospheric float64
|
||||
type feature struct {
|
||||
value float64
|
||||
weight float64
|
||||
}
|
||||
|
||||
type restaurant struct {
|
||||
type review struct {
|
||||
name string
|
||||
address string
|
||||
url string
|
||||
|
||||
feats features
|
||||
features map[string]feature
|
||||
|
||||
latitude float64
|
||||
longitude float64
|
||||
@ -53,7 +51,15 @@ type restaurant struct {
|
||||
|
||||
type scraper interface {
|
||||
index(doc *goquery.Document) (string, []string)
|
||||
review(doc *goquery.Document) (string, string, features, error)
|
||||
review(doc *goquery.Document) (string, string, map[string]feature, error)
|
||||
}
|
||||
|
||||
// decoder resolves a street address to a pair of coordinates. decodeReviews
// assigns the two float64 results to a review's latitude and longitude, in
// that order, and forwards the review only when the returned error is nil.
type decoder interface {
|
||||
// decode returns (latitude, longitude, error) for address.
decode(address string) (float64, float64, error)
|
||||
}
|
||||
|
||||
// loader supplies the goquery document for a URL. scrapeReview and
// scrapeIndex accept a loader instead of a concrete cache type; both log the
// failure and return early when load reports an error.
type loader interface {
|
||||
// load returns the document for url, or an error if it cannot be obtained.
load(url string) (*goquery.Document, error)
|
||||
}
|
||||
|
||||
func makeAbsUrl(ref, base string) (string, error) {
|
||||
@ -70,13 +76,12 @@ func makeAbsUrl(ref, base string) (string, error) {
|
||||
return b.ResolveReference(r).String(), nil
|
||||
}
|
||||
|
||||
func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
|
||||
func decodeReviews(in chan review, out chan review, dec decoder) {
|
||||
for {
|
||||
if res, ok := <-in; ok {
|
||||
pos, err := gc.decode(res.address)
|
||||
var err error
|
||||
res.latitude, res.longitude, err = dec.decode(res.address)
|
||||
if err == nil {
|
||||
res.latitude = pos.Latitude
|
||||
res.longitude = pos.Longitude
|
||||
out <- res
|
||||
} else {
|
||||
log.Printf("failed to decode address for %s (%v)", res.url, err)
|
||||
@ -88,30 +93,30 @@ func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
|
||||
}
|
||||
}
|
||||
|
||||
func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
|
||||
func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
|
||||
defer group.Done()
|
||||
|
||||
doc, err := wc.load(url)
|
||||
doc, err := lod.load(url)
|
||||
if err != nil {
|
||||
log.Printf("failed to load review at %s (%v)", url, err)
|
||||
return
|
||||
}
|
||||
|
||||
name, address, feats, err := scr.review(doc)
|
||||
name, address, features, err := scr.review(doc)
|
||||
if err != nil {
|
||||
log.Printf("failed to scrape review at %s (%v)", url, err)
|
||||
return
|
||||
}
|
||||
|
||||
out <- restaurant{
|
||||
out <- review{
|
||||
name: name,
|
||||
address: address,
|
||||
feats: feats,
|
||||
features: features,
|
||||
url: url}
|
||||
}
|
||||
|
||||
func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
|
||||
doc, err := wc.load(indexUrl)
|
||||
func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
||||
doc, err := lod.load(indexUrl)
|
||||
if err != nil {
|
||||
log.Printf("failed to load index at %s (%v)", indexUrl, err)
|
||||
return
|
||||
@ -130,7 +135,7 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper
|
||||
}
|
||||
|
||||
group.Add(1)
|
||||
go scrapeReview(absUrl, out, wc, &group, scr)
|
||||
go scrapeReview(absUrl, out, lod, scr, &group)
|
||||
}
|
||||
group.Wait()
|
||||
|
||||
@ -142,18 +147,18 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
scrapeIndex(absUrl, out, wc, scr)
|
||||
scrapeIndex(absUrl, out, lod, scr)
|
||||
}
|
||||
}
|
||||
|
||||
func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant {
|
||||
out := make(chan restaurant, 128)
|
||||
in := make(chan restaurant, 128)
|
||||
func scrape(url string, lod loader, dec decoder, scr scraper) []review {
|
||||
out := make(chan review, 128)
|
||||
in := make(chan review, 128)
|
||||
|
||||
go scrapeIndex(url, in, wc, scr)
|
||||
go decodeReviews(in, out, gc)
|
||||
go scrapeIndex(url, in, lod, scr)
|
||||
go decodeReviews(in, out, dec)
|
||||
|
||||
var results []restaurant
|
||||
var results []review
|
||||
for {
|
||||
if res, ok := <-out; ok {
|
||||
results = append(results, res)
|
||||
|
@ -50,7 +50,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) {
|
||||
return nextIndexUrl, reviewUrls
|
||||
}
|
||||
|
||||
func (tabelog) review(doc *goquery.Document) (name, address string, feat features, err error) {
|
||||
func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) {
|
||||
name = doc.Find("a.rd-header__rst-name-main").Text()
|
||||
|
||||
if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
|
||||
@ -60,19 +60,19 @@ func (tabelog) review(doc *goquery.Document) (name, address string, feat feature
|
||||
return
|
||||
}
|
||||
|
||||
f := make(map[string]float64)
|
||||
features = make(map[string]feature)
|
||||
|
||||
for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} {
|
||||
text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
|
||||
if f[category], err = strconv.ParseFloat(text, 8); err != nil {
|
||||
valueText := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
|
||||
|
||||
var value float64
|
||||
if value, err = strconv.ParseFloat(valueText, 8); err != nil {
|
||||
err = fmt.Errorf("invalid value for %s", category)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
feat.accommodating = f["service"]/2.5 - 1.0
|
||||
feat.affordable = f["cost"]/2.5 - 1.0
|
||||
feat.atmospheric = f["atmosphere"]/2.5 - 1.0
|
||||
feat.delicious = f["dishes"]/2.5 - 1.0
|
||||
features[category] = feature{value/2.5 - 1.0, 1.0}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
@ -50,7 +50,7 @@ func (tripadvisor) index(doc *goquery.Document) (string, []string) {
|
||||
return nextIndexUrl, reviewUrls
|
||||
}
|
||||
|
||||
func (tripadvisor) review(doc *goquery.Document) (name, address string, feat features, err error) {
|
||||
func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) {
|
||||
name = strings.TrimSpace(doc.Find("h1#HEADING").Text())
|
||||
address = strings.TrimSpace(doc.Find("address span.format_address").Text())
|
||||
|
||||
@ -60,20 +60,20 @@ func (tripadvisor) review(doc *goquery.Document) (name, address string, feat fea
|
||||
return
|
||||
}
|
||||
|
||||
f := make(map[string]float64)
|
||||
features = make(map[string]feature)
|
||||
|
||||
for index, category := range []string{"food", "service", "value", "atmosphere"} {
|
||||
alt, _ := ratings.Eq(index).Attr("alt")
|
||||
rating := strings.Split(alt, " ")[0]
|
||||
if f[category], err = strconv.ParseFloat(rating, 8); err != nil {
|
||||
altText, _ := ratings.Eq(index).Attr("alt")
|
||||
valueText := strings.Split(altText, " ")[0]
|
||||
|
||||
var value float64
|
||||
if value, err = strconv.ParseFloat(valueText, 8); err != nil {
|
||||
err = fmt.Errorf("invalid value for %s", category)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
feat.accommodating = f["service"]/2.5 - 1.0
|
||||
feat.affordable = f["value"]/2.5 - 1.0
|
||||
feat.atmospheric = f["atmosphere"]/2.5 - 1.0
|
||||
feat.delicious = f["food"]/2.5 - 1.0
|
||||
features[category] = feature{value/2.5 - 1.0, 1.0}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user