Author: Alex Yatskov, 2015-09-17 15:54:32 +09:00
Parent: f3566cab3c
Commit: bcfeae55b8
5 changed files with 181 additions and 176 deletions

View File

@@ -24,7 +24,6 @@ package main
 import (
 	"bufio"
-	"database/sql"
 	"errors"
 	"flag"
 	"log"
@@ -34,14 +33,14 @@ import (
 	_ "github.com/mattn/go-sqlite3"
 )
-func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant, error) {
+func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
 	file, err := os.Open(urlsPath)
 	if err != nil {
 		return nil, err
 	}
 	defer file.Close()
-	var results []restaurant
+	var reviews []review
 	var scanner = bufio.NewScanner(file)
 	for scanner.Scan() {
@@ -51,24 +50,21 @@ func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant,
 			return nil, err
 		}
-		var items []restaurant
 		switch parsed.Host {
 		case "tabelog.com":
-			items = scrape(line, wc, gc, tabelog{})
+			reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
 		case "www.tripadvisor.com":
-			items = scrape(line, wc, gc, tripadvisor{})
+			reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
 		default:
 			return nil, errors.New("unsupported review site")
 		}
-			results = append(results, items...)
 		}
 	}
-	return results, nil
+	return reviews, nil
 }
-func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
+func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
 	gc, err := newGeoCache(geocachePath)
 	if err != nil {
 		return nil, err
@@ -80,131 +76,131 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, erro
 		return nil, err
 	}
-	restaurants, err := scrapeDataUrls(urlsPath, wc, gc)
+	reviews, err := scrapeDataUrls(urlsPath, wc, gc)
 	if err != nil {
 		return nil, err
 	}
-	return restaurants, nil
+	return reviews, nil
 }
-func computeStnData(restaurants []restaurant, stationsPath string) error {
-	sq, err := newStationQuery(stationsPath)
-	if err != nil {
-		return err
-	}
-	for i, _ := range restaurants {
-		r := &restaurants[i]
-		r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
-	}
-	return nil
-}
+// func computeStnData(reviews []restaurant, stationsPath string) error {
+// 	sq, err := newStationQuery(stationsPath)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	for i, _ := range reviews {
+// 		r := &reviews[i]
+// 		r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
+// 	}
+// 	return nil
+// }
-func dumpData(dbPath string, restaraunts []restaurant) error {
-	db, err := sql.Open("sqlite3", dbPath)
-	if err != nil {
-		return err
-	}
-	defer db.Close()
-	_, err = db.Exec(`
-		DROP TABLE IF EXISTS reviews;
-		CREATE TABLE reviews(
-			name VARCHAR(100) NOT NULL,
-			url VARCHAR(200) NOT NULL,
-			delicious FLOAT NOT NULL,
-			accommodating FLOAT NOT NULL,
-			affordable FLOAT NOT NULL,
-			atmospheric FLOAT NOT NULL,
-			latitude FLOAT NOT NULL,
-			longitude FLOAT NOT NULL,
-			closestStnDist FLOAT NOT NULL,
-			closestStnName VARCHAR(100) NOT NULL,
-			accessCount INTEGER NOT NULL,
-			id INTEGER PRIMARY KEY
-		)`)
-	if err != nil {
-		return err
-	}
-	for _, r := range restaraunts {
-		_, err = db.Exec(`
-			INSERT INTO reviews(
-				name,
-				url,
-				delicious,
-				accommodating,
-				affordable,
-				atmospheric,
-				latitude,
-				longitude,
-				closestStnDist,
-				closestStnName,
-				accessCount
-			) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
-			r.name,
-			r.url,
-			r.feats.delicious,
-			r.feats.accommodating,
-			r.feats.affordable,
-			r.feats.atmospheric,
-			r.latitude,
-			r.longitude,
-			r.closestStnDist,
-			r.closestStnName,
-			0)
-		if err != nil {
-			return err
-		}
-	}
-	_, err = db.Exec(`
-		DROP TABLE IF EXISTS categories;
-		CREATE TABLE categories(
-			description VARCHAR(200) NOT NULL,
-			id INTEGER PRIMARY KEY)`)
-	if err != nil {
-		return err
-	}
-	for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
-		if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
-			return err
-		}
-	}
-	_, err = db.Exec(`
-		DROP TABLE IF EXISTS history;
-		CREATE TABLE history(
-			date DATETIME NOT NULL,
-			reviewId INTEGER NOT NULL,
-			id INTEGER PRIMARY KEY,
-			FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
-	if err != nil {
-		return err
-	}
-	_, err = db.Exec(`
-		DROP TABLE IF EXISTS historyGroups;
-		CREATE TABLE historyGroups(
-			categoryId INTEGER NOT NULL,
-			categoryValue FLOAT NOT NULL,
-			historyId INTEGER NOT NULL,
-			FOREIGN KEY(historyId) REFERENCES history(id),
-			FOREIGN KEY(categoryId) REFERENCES categories(id))`)
-	if err != nil {
-		return err
-	}
-	return nil
-}
+// func dumpData(dbPath string, restaraunts []restaurant) error {
+// 	db, err := sql.Open("sqlite3", dbPath)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	defer db.Close()
+// 	_, err = db.Exec(`
+// 		DROP TABLE IF EXISTS reviews;
+// 		CREATE TABLE reviews(
+// 			name VARCHAR(100) NOT NULL,
+// 			url VARCHAR(200) NOT NULL,
+// 			delicious FLOAT NOT NULL,
+// 			accommodating FLOAT NOT NULL,
+// 			affordable FLOAT NOT NULL,
+// 			atmospheric FLOAT NOT NULL,
+// 			latitude FLOAT NOT NULL,
+// 			longitude FLOAT NOT NULL,
+// 			closestStnDist FLOAT NOT NULL,
+// 			closestStnName VARCHAR(100) NOT NULL,
+// 			accessCount INTEGER NOT NULL,
+// 			id INTEGER PRIMARY KEY
+// 		)`)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	for _, r := range restaraunts {
+// 		_, err = db.Exec(`
+// 			INSERT INTO reviews(
+// 				name,
+// 				url,
+// 				delicious,
+// 				accommodating,
+// 				affordable,
+// 				atmospheric,
+// 				latitude,
+// 				longitude,
+// 				closestStnDist,
+// 				closestStnName,
+// 				accessCount
+// 			) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+// 			r.name,
+// 			r.url,
+// 			r.feats.delicious,
+// 			r.feats.accommodating,
+// 			r.feats.affordable,
+// 			r.feats.atmospheric,
+// 			r.latitude,
+// 			r.longitude,
+// 			r.closestStnDist,
+// 			r.closestStnName,
+// 			0)
+// 		if err != nil {
+// 			return err
+// 		}
+// 	}
+// 	_, err = db.Exec(`
+// 		DROP TABLE IF EXISTS categories;
+// 		CREATE TABLE categories(
+// 			description VARCHAR(200) NOT NULL,
+// 			id INTEGER PRIMARY KEY)`)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
+// 		if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
+// 			return err
+// 		}
+// 	}
+// 	_, err = db.Exec(`
+// 		DROP TABLE IF EXISTS history;
+// 		CREATE TABLE history(
+// 			date DATETIME NOT NULL,
+// 			reviewId INTEGER NOT NULL,
+// 			id INTEGER PRIMARY KEY,
+// 			FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	_, err = db.Exec(`
+// 		DROP TABLE IF EXISTS historyGroups;
+// 		CREATE TABLE historyGroups(
+// 			categoryId INTEGER NOT NULL,
+// 			categoryValue FLOAT NOT NULL,
+// 			historyId INTEGER NOT NULL,
+// 			FOREIGN KEY(historyId) REFERENCES history(id),
+// 			FOREIGN KEY(categoryId) REFERENCES categories(id))`)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	return nil
+// }
 func main() {
 	dbPath := flag.String("db", "data/db.sqlite3", "database output path")
@@ -214,16 +210,16 @@ func main() {
 	webcachePath := flag.String("webcache", "cache/webcache", "web data cache")
 	flag.Parse()
-	restaurants, err := scrapeData(*urlsPath, *geocachePath, *webcachePath)
+	reviews, err := scrapeData(*urlsPath, *geocachePath, *webcachePath)
 	if err != nil {
 		log.Fatal(err)
 	}
-	if err := computeStnData(restaurants, *stationsPath); err != nil {
-		log.Fatal(err)
-	}
-	if err := dumpData(*dbPath, restaurants); err != nil {
-		log.Fatal(err)
-	}
+	// if err := computeStnData(reviews, *stationsPath); err != nil {
+	// 	log.Fatal(err)
+	// }
+	// if err := dumpData(*dbPath, reviews); err != nil {
+	// 	log.Fatal(err)
+	// }
 }
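
scrapeDataUrls walks the URL list one line at a time and picks a scraper from the parsed host, so the file behind urlsPath is expected to hold one index URL per line. A hypothetical example of its contents; the concrete paths are placeholders, only the two hosts are taken from the switch above:

https://tabelog.com/en/tokyo/rstLst/
https://www.tripadvisor.com/Restaurants-g298184-Tokyo_Tokyo_Prefecture.html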

View File

@@ -79,19 +79,23 @@ func (c *geoCache) save() error {
 	return ioutil.WriteFile(c.filename, js, 0644)
 }
-func (c *geoCache) decode(address string) (geoPos, error) {
+func (c *geoCache) decode(address string) (latitude float64, longitude float64, err error) {
 	if pos, ok := c.data[address]; ok {
-		return pos, nil
+		latitude = pos.Latitude
+		longitude = pos.Longitude
+		return
 	}
 	<-c.ticker.C
 	point, err := c.coder.Geocode(address)
 	if err != nil {
-		return geoPos{}, err
+		return
 	}
-	pos := geoPos{point.Lat(), point.Lng()}
-	c.data[address] = pos
-	return pos, nil
+	latitude = point.Lat()
+	longitude = point.Lng()
+	c.data[address] = geoPos{latitude, longitude}
+	return
 }
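
The reworked decode drops the geoPos result in favor of named latitude/longitude/err returns, so both the cache hit and the geocoder path assign the results and bare-return. A minimal self-contained sketch of the same pattern; the lookup map and coordinates are invented stand-ins, not part of geoCache:

package main

import "fmt"

// lookup stands in for the real cache and geocoder in this sketch.
var lookup = map[string][2]float64{
	"tokyo station": {35.681236, 139.767125},
}

// decode mirrors the shape of the new geoCache.decode: named results plus bare returns.
func decode(address string) (latitude float64, longitude float64, err error) {
	if pos, ok := lookup[address]; ok {
		latitude, longitude = pos[0], pos[1]
		return
	}
	err = fmt.Errorf("unknown address: %s", address)
	return
}

func main() {
	lat, lng, err := decode("tokyo station")
	fmt.Println(lat, lng, err)
}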

View File

@@ -30,19 +30,17 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )
-type features struct {
-	delicious float64
-	accommodating float64
-	affordable float64
-	atmospheric float64
+type feature struct {
+	value float64
+	weight float64
 }
-type restaurant struct {
+type review struct {
 	name string
 	address string
 	url string
-	feats features
+	features map[string]feature
 	latitude float64
 	longitude float64
@@ -53,7 +51,15 @@ type restaurant struct {
 type scraper interface {
 	index(doc *goquery.Document) (string, []string)
-	review(doc *goquery.Document) (string, string, features, error)
+	review(doc *goquery.Document) (string, string, map[string]feature, error)
+}
+
+type decoder interface {
+	decode(address string) (float64, float64, error)
+}
+
+type loader interface {
+	load(url string) (*goquery.Document, error)
 }
 
 func makeAbsUrl(ref, base string) (string, error) {
@@ -70,13 +76,12 @@ func makeAbsUrl(ref, base string) (string, error) {
 	return b.ResolveReference(r).String(), nil
 }
-func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
+func decodeReviews(in chan review, out chan review, dec decoder) {
 	for {
 		if res, ok := <-in; ok {
-			pos, err := gc.decode(res.address)
+			var err error
+			res.latitude, res.longitude, err = dec.decode(res.address)
 			if err == nil {
-				res.latitude = pos.Latitude
-				res.longitude = pos.Longitude
 				out <- res
 			} else {
 				log.Printf("failed to decode address for %s (%v)", res.url, err)
@@ -88,30 +93,30 @@ func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
 	}
 }
-func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
+func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
 	defer group.Done()
-	doc, err := wc.load(url)
+	doc, err := lod.load(url)
 	if err != nil {
 		log.Printf("failed to load review at %s (%v)", url, err)
 		return
 	}
-	name, address, feats, err := scr.review(doc)
+	name, address, features, err := scr.review(doc)
 	if err != nil {
 		log.Printf("failed to scrape review at %s (%v)", url, err)
 		return
 	}
-	out <- restaurant{
+	out <- review{
 		name: name,
 		address: address,
-		feats: feats,
+		features: features,
 		url: url}
 }
-func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
-	doc, err := wc.load(indexUrl)
+func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
+	doc, err := lod.load(indexUrl)
 	if err != nil {
 		log.Printf("failed to load index at %s (%v)", indexUrl, err)
 		return
@@ -130,7 +135,7 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper
 		}
 		group.Add(1)
-		go scrapeReview(absUrl, out, wc, &group, scr)
+		go scrapeReview(absUrl, out, lod, scr, &group)
 	}
 	group.Wait()
@@ -142,18 +147,18 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper
 			log.Fatal(err)
 		}
-		scrapeIndex(absUrl, out, wc, scr)
+		scrapeIndex(absUrl, out, lod, scr)
 	}
 }
-func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant {
-	out := make(chan restaurant, 128)
-	in := make(chan restaurant, 128)
-	go scrapeIndex(url, in, wc, scr)
-	go decodeReviews(in, out, gc)
-	var results []restaurant
+func scrape(url string, lod loader, dec decoder, scr scraper) []review {
+	out := make(chan review, 128)
+	in := make(chan review, 128)
+	go scrapeIndex(url, in, lod, scr)
+	go decodeReviews(in, out, dec)
+	var results []review
 	for {
 		if res, ok := <-out; ok {
 			results = append(results, res)
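
With scrape, scrapeIndex, scrapeReview, and decodeReviews now taking the loader and decoder interfaces instead of *webCache and *geoCache, the pipeline can be driven by test doubles. A rough sketch of stubs that would satisfy the two interfaces as declared above; the HTML snippet and coordinates are made up, only the method signatures come from this commit:

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// stubLoader satisfies the loader interface without touching the network.
type stubLoader struct{}

func (stubLoader) load(url string) (*goquery.Document, error) {
	html := `<html><body><h1>stub page for ` + url + `</h1></body></html>`
	return goquery.NewDocumentFromReader(strings.NewReader(html))
}

// stubDecoder satisfies the decoder interface with a fixed coordinate pair.
type stubDecoder struct{}

func (stubDecoder) decode(address string) (float64, float64, error) {
	return 35.6581, 139.7017, nil
}

func main() {
	doc, err := stubLoader{}.load("http://example.com/")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(doc.Find("h1").Text())

	lat, lng, _ := stubDecoder{}.decode("anywhere")
	fmt.Println(lat, lng)
}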

View File

@@ -50,7 +50,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) {
 	return nextIndexUrl, reviewUrls
 }
-func (tabelog) review(doc *goquery.Document) (name, address string, feat features, err error) {
+func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) {
 	name = doc.Find("a.rd-header__rst-name-main").Text()
 	if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
@@ -60,19 +60,19 @@ func (tabelog) review(doc *goquery.Document) (name, address string, feat feature
 		return
 	}
-	f := make(map[string]float64)
+	features = make(map[string]feature)
 	for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} {
-		text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
-		if f[category], err = strconv.ParseFloat(text, 8); err != nil {
+		valueText := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
+		var value float64
+		if value, err = strconv.ParseFloat(valueText, 8); err != nil {
 			err = fmt.Errorf("invalid value for %s", category)
 			return
 		}
-	}
-	feat.accommodating = f["service"]/2.5 - 1.0
-	feat.affordable = f["cost"]/2.5 - 1.0
-	feat.atmospheric = f["atmosphere"]/2.5 - 1.0
-	feat.delicious = f["dishes"]/2.5 - 1.0
+		features[category] = feature{value/2.5 - 1.0, 1.0}
+	}
 	return
 }

View File

@@ -50,7 +50,7 @@ func (tripadvisor) index(doc *goquery.Document) (string, []string) {
 	return nextIndexUrl, reviewUrls
 }
-func (tripadvisor) review(doc *goquery.Document) (name, address string, feat features, err error) {
+func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) {
 	name = strings.TrimSpace(doc.Find("h1#HEADING").Text())
 	address = strings.TrimSpace(doc.Find("address span.format_address").Text())
@@ -60,20 +60,20 @@ func (tripadvisor) review(doc *goquery.Document) (name, address string, feat fea
 		return
 	}
-	f := make(map[string]float64)
+	features = make(map[string]feature)
 	for index, category := range []string{"food", "service", "value", "atmosphere"} {
-		alt, _ := ratings.Eq(index).Attr("alt")
-		rating := strings.Split(alt, " ")[0]
-		if f[category], err = strconv.ParseFloat(rating, 8); err != nil {
+		altText, _ := ratings.Eq(index).Attr("alt")
+		valueText := strings.Split(altText, " ")[0]
+		var value float64
+		if value, err = strconv.ParseFloat(valueText, 8); err != nil {
 			err = fmt.Errorf("invalid value for %s", category)
 			return
 		}
-	}
-	feat.accommodating = f["service"]/2.5 - 1.0
-	feat.affordable = f["value"]/2.5 - 1.0
-	feat.atmospheric = f["atmosphere"]/2.5 - 1.0
-	feat.delicious = f["food"]/2.5 - 1.0
+		features[category] = feature{value/2.5 - 1.0, 1.0}
+	}
 	return
 }
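
Both site scrapers now fill the features map with a normalized value and a fixed weight of 1.0; the value/2.5 - 1.0 expression maps the sites' 0-5 rating scale onto [-1, +1]. A small worked example of that normalization; the sample ratings are arbitrary:

package main

import "fmt"

// feature mirrors the two-field struct introduced in this commit.
type feature struct {
	value  float64
	weight float64
}

// normalize maps a 0-5 rating to a feature value in [-1, +1], as both scrapers do.
func normalize(rating float64) feature {
	return feature{rating/2.5 - 1.0, 1.0}
}

func main() {
	for _, rating := range []float64{0, 2.5, 4, 5} {
		fmt.Printf("rating %.1f -> value %+.1f\n", rating, normalize(rating).value)
	}
	// 0 -> -1.0, 2.5 -> 0.0, 4 -> +0.6, 5 -> +1.0
}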