1
This commit is contained in:
Alex Yatskov 2015-09-17 15:54:32 +09:00
parent f3566cab3c
commit bcfeae55b8
5 changed files with 181 additions and 176 deletions

View File

@ -24,7 +24,6 @@ package main
import (
"bufio"
"database/sql"
"errors"
"flag"
"log"
@ -34,14 +33,14 @@ import (
_ "github.com/mattn/go-sqlite3"
)
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant, error) {
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
file, err := os.Open(urlsPath)
if err != nil {
return nil, err
}
defer file.Close()
var results []restaurant
var reviews []review
var scanner = bufio.NewScanner(file)
for scanner.Scan() {
@ -51,24 +50,21 @@ func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant,
return nil, err
}
var items []restaurant
switch parsed.Host {
case "tabelog.com":
items = scrape(line, wc, gc, tabelog{})
reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
case "www.tripadvisor.com":
items = scrape(line, wc, gc, tripadvisor{})
reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
default:
return nil, errors.New("unsupported review site")
}
results = append(results, items...)
}
}
return results, nil
return reviews, nil
}
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
gc, err := newGeoCache(geocachePath)
if err != nil {
return nil, err
@ -80,131 +76,131 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, erro
return nil, err
}
restaurants, err := scrapeDataUrls(urlsPath, wc, gc)
reviews, err := scrapeDataUrls(urlsPath, wc, gc)
if err != nil {
return nil, err
}
return restaurants, nil
return reviews, nil
}
func computeStnData(restaurants []restaurant, stationsPath string) error {
sq, err := newStationQuery(stationsPath)
if err != nil {
return err
}
// func computeStnData(reviews []restaurant, stationsPath string) error {
// sq, err := newStationQuery(stationsPath)
// if err != nil {
// return err
// }
for i, _ := range restaurants {
r := &restaurants[i]
r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
}
// for i, _ := range reviews {
// r := &reviews[i]
// r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
// }
return nil
}
// return nil
// }
func dumpData(dbPath string, restaraunts []restaurant) error {
db, err := sql.Open("sqlite3", dbPath)
if err != nil {
return err
}
defer db.Close()
// func dumpData(dbPath string, restaraunts []restaurant) error {
// db, err := sql.Open("sqlite3", dbPath)
// if err != nil {
// return err
// }
// defer db.Close()
_, err = db.Exec(`
DROP TABLE IF EXISTS reviews;
CREATE TABLE reviews(
name VARCHAR(100) NOT NULL,
url VARCHAR(200) NOT NULL,
delicious FLOAT NOT NULL,
accommodating FLOAT NOT NULL,
affordable FLOAT NOT NULL,
atmospheric FLOAT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL,
closestStnDist FLOAT NOT NULL,
closestStnName VARCHAR(100) NOT NULL,
accessCount INTEGER NOT NULL,
id INTEGER PRIMARY KEY
)`)
// _, err = db.Exec(`
// DROP TABLE IF EXISTS reviews;
// CREATE TABLE reviews(
// name VARCHAR(100) NOT NULL,
// url VARCHAR(200) NOT NULL,
// delicious FLOAT NOT NULL,
// accommodating FLOAT NOT NULL,
// affordable FLOAT NOT NULL,
// atmospheric FLOAT NOT NULL,
// latitude FLOAT NOT NULL,
// longitude FLOAT NOT NULL,
// closestStnDist FLOAT NOT NULL,
// closestStnName VARCHAR(100) NOT NULL,
// accessCount INTEGER NOT NULL,
// id INTEGER PRIMARY KEY
// )`)
if err != nil {
return err
}
// if err != nil {
// return err
// }
for _, r := range restaraunts {
_, err = db.Exec(`
INSERT INTO reviews(
name,
url,
delicious,
accommodating,
affordable,
atmospheric,
latitude,
longitude,
closestStnDist,
closestStnName,
accessCount
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
r.name,
r.url,
r.feats.delicious,
r.feats.accommodating,
r.feats.affordable,
r.feats.atmospheric,
r.latitude,
r.longitude,
r.closestStnDist,
r.closestStnName,
0)
// for _, r := range restaraunts {
// _, err = db.Exec(`
// INSERT INTO reviews(
// name,
// url,
// delicious,
// accommodating,
// affordable,
// atmospheric,
// latitude,
// longitude,
// closestStnDist,
// closestStnName,
// accessCount
// ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
// r.name,
// r.url,
// r.feats.delicious,
// r.feats.accommodating,
// r.feats.affordable,
// r.feats.atmospheric,
// r.latitude,
// r.longitude,
// r.closestStnDist,
// r.closestStnName,
// 0)
if err != nil {
return err
}
}
// if err != nil {
// return err
// }
// }
_, err = db.Exec(`
DROP TABLE IF EXISTS categories;
CREATE TABLE categories(
description VARCHAR(200) NOT NULL,
id INTEGER PRIMARY KEY)`)
// _, err = db.Exec(`
// DROP TABLE IF EXISTS categories;
// CREATE TABLE categories(
// description VARCHAR(200) NOT NULL,
// id INTEGER PRIMARY KEY)`)
if err != nil {
return err
}
// if err != nil {
// return err
// }
for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
return err
}
}
// for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
// if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
// return err
// }
// }
_, err = db.Exec(`
DROP TABLE IF EXISTS history;
CREATE TABLE history(
date DATETIME NOT NULL,
reviewId INTEGER NOT NULL,
id INTEGER PRIMARY KEY,
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
// _, err = db.Exec(`
// DROP TABLE IF EXISTS history;
// CREATE TABLE history(
// date DATETIME NOT NULL,
// reviewId INTEGER NOT NULL,
// id INTEGER PRIMARY KEY,
// FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
if err != nil {
return err
}
// if err != nil {
// return err
// }
_, err = db.Exec(`
DROP TABLE IF EXISTS historyGroups;
CREATE TABLE historyGroups(
categoryId INTEGER NOT NULL,
categoryValue FLOAT NOT NULL,
historyId INTEGER NOT NULL,
FOREIGN KEY(historyId) REFERENCES history(id),
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
// _, err = db.Exec(`
// DROP TABLE IF EXISTS historyGroups;
// CREATE TABLE historyGroups(
// categoryId INTEGER NOT NULL,
// categoryValue FLOAT NOT NULL,
// historyId INTEGER NOT NULL,
// FOREIGN KEY(historyId) REFERENCES history(id),
// FOREIGN KEY(categoryId) REFERENCES categories(id))`)
if err != nil {
return err
}
// if err != nil {
// return err
// }
return nil
}
// return nil
// }
func main() {
dbPath := flag.String("db", "data/db.sqlite3", "database output path")
@ -214,16 +210,16 @@ func main() {
webcachePath := flag.String("webcache", "cache/webcache", "web data cache")
flag.Parse()
restaurants, err := scrapeData(*urlsPath, *geocachePath, *webcachePath)
reviews, err := scrapeData(*urlsPath, *geocachePath, *webcachePath)
if err != nil {
log.Fatal(err)
}
if err := computeStnData(restaurants, *stationsPath); err != nil {
log.Fatal(err)
}
// if err := computeStnData(reviews, *stationsPath); err != nil {
// log.Fatal(err)
// }
if err := dumpData(*dbPath, restaurants); err != nil {
log.Fatal(err)
}
// if err := dumpData(*dbPath, reviews); err != nil {
// log.Fatal(err)
// }
}

View File

@ -79,19 +79,23 @@ func (c *geoCache) save() error {
return ioutil.WriteFile(c.filename, js, 0644)
}
func (c *geoCache) decode(address string) (geoPos, error) {
func (c *geoCache) decode(address string) (latitude float64, longitude float64, err error) {
if pos, ok := c.data[address]; ok {
return pos, nil
latitude = pos.Latitude
longitude = pos.Longitude
return
}
<-c.ticker.C
point, err := c.coder.Geocode(address)
if err != nil {
return geoPos{}, err
return
}
pos := geoPos{point.Lat(), point.Lng()}
c.data[address] = pos
return pos, nil
latitude = point.Lat()
longitude = point.Lng()
c.data[address] = geoPos{latitude, longitude}
return
}

View File

@ -30,19 +30,17 @@ import (
"github.com/PuerkitoBio/goquery"
)
type features struct {
delicious float64
accommodating float64
affordable float64
atmospheric float64
type feature struct {
value float64
weight float64
}
type restaurant struct {
type review struct {
name string
address string
url string
feats features
features map[string]feature
latitude float64
longitude float64
@ -53,7 +51,15 @@ type restaurant struct {
type scraper interface {
index(doc *goquery.Document) (string, []string)
review(doc *goquery.Document) (string, string, features, error)
review(doc *goquery.Document) (string, string, map[string]feature, error)
}
type decoder interface {
decode(address string) (float64, float64, error)
}
type loader interface {
load(url string) (*goquery.Document, error)
}
func makeAbsUrl(ref, base string) (string, error) {
@ -70,13 +76,12 @@ func makeAbsUrl(ref, base string) (string, error) {
return b.ResolveReference(r).String(), nil
}
func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
func decodeReviews(in chan review, out chan review, dec decoder) {
for {
if res, ok := <-in; ok {
pos, err := gc.decode(res.address)
var err error
res.latitude, res.longitude, err = dec.decode(res.address)
if err == nil {
res.latitude = pos.Latitude
res.longitude = pos.Longitude
out <- res
} else {
log.Printf("failed to decode address for %s (%v)", res.url, err)
@ -88,30 +93,30 @@ func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
}
}
func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
defer group.Done()
doc, err := wc.load(url)
doc, err := lod.load(url)
if err != nil {
log.Printf("failed to load review at %s (%v)", url, err)
return
}
name, address, feats, err := scr.review(doc)
name, address, features, err := scr.review(doc)
if err != nil {
log.Printf("failed to scrape review at %s (%v)", url, err)
return
}
out <- restaurant{
out <- review{
name: name,
address: address,
feats: feats,
features: features,
url: url}
}
func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
doc, err := wc.load(indexUrl)
func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
doc, err := lod.load(indexUrl)
if err != nil {
log.Printf("failed to load index at %s (%v)", indexUrl, err)
return
@ -130,7 +135,7 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper
}
group.Add(1)
go scrapeReview(absUrl, out, wc, &group, scr)
go scrapeReview(absUrl, out, lod, scr, &group)
}
group.Wait()
@ -142,18 +147,18 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper
log.Fatal(err)
}
scrapeIndex(absUrl, out, wc, scr)
scrapeIndex(absUrl, out, lod, scr)
}
}
func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant {
out := make(chan restaurant, 128)
in := make(chan restaurant, 128)
func scrape(url string, lod loader, dec decoder, scr scraper) []review {
out := make(chan review, 128)
in := make(chan review, 128)
go scrapeIndex(url, in, wc, scr)
go decodeReviews(in, out, gc)
go scrapeIndex(url, in, lod, scr)
go decodeReviews(in, out, dec)
var results []restaurant
var results []review
for {
if res, ok := <-out; ok {
results = append(results, res)

View File

@ -50,7 +50,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) {
return nextIndexUrl, reviewUrls
}
func (tabelog) review(doc *goquery.Document) (name, address string, feat features, err error) {
func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) {
name = doc.Find("a.rd-header__rst-name-main").Text()
if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
@ -60,19 +60,19 @@ func (tabelog) review(doc *goquery.Document) (name, address string, feat feature
return
}
f := make(map[string]float64)
features = make(map[string]feature)
for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} {
text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
if f[category], err = strconv.ParseFloat(text, 8); err != nil {
valueText := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
var value float64
if value, err = strconv.ParseFloat(valueText, 8); err != nil {
err = fmt.Errorf("invalid value for %s", category)
return
}
}
feat.accommodating = f["service"]/2.5 - 1.0
feat.affordable = f["cost"]/2.5 - 1.0
feat.atmospheric = f["atmosphere"]/2.5 - 1.0
feat.delicious = f["dishes"]/2.5 - 1.0
features[category] = feature{value/2.5 - 1.0, 1.0}
}
return
}

View File

@ -50,7 +50,7 @@ func (tripadvisor) index(doc *goquery.Document) (string, []string) {
return nextIndexUrl, reviewUrls
}
func (tripadvisor) review(doc *goquery.Document) (name, address string, feat features, err error) {
func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) {
name = strings.TrimSpace(doc.Find("h1#HEADING").Text())
address = strings.TrimSpace(doc.Find("address span.format_address").Text())
@ -60,20 +60,20 @@ func (tripadvisor) review(doc *goquery.Document) (name, address string, feat fea
return
}
f := make(map[string]float64)
features = make(map[string]feature)
for index, category := range []string{"food", "service", "value", "atmosphere"} {
alt, _ := ratings.Eq(index).Attr("alt")
rating := strings.Split(alt, " ")[0]
if f[category], err = strconv.ParseFloat(rating, 8); err != nil {
altText, _ := ratings.Eq(index).Attr("alt")
valueText := strings.Split(altText, " ")[0]
var value float64
if value, err = strconv.ParseFloat(valueText, 8); err != nil {
err = fmt.Errorf("invalid value for %s", category)
return
}
}
feat.accommodating = f["service"]/2.5 - 1.0
feat.affordable = f["value"]/2.5 - 1.0
feat.atmospheric = f["atmosphere"]/2.5 - 1.0
feat.delicious = f["food"]/2.5 - 1.0
features[category] = feature{value/2.5 - 1.0, 1.0}
}
return
}