From bcfeae55b8e26b54f172b639565625612bb9320b Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Thu, 17 Sep 2015 15:54:32 +0900 Subject: [PATCH] Refactor --- build/build.go | 238 +++++++++++++++++++++---------------------- build/geocache.go | 16 +-- build/scrape.go | 65 ++++++------ build/tabelog.go | 18 ++-- build/tripadvisor.go | 20 ++-- 5 files changed, 181 insertions(+), 176 deletions(-) diff --git a/build/build.go b/build/build.go index 41017bc..a61a84b 100644 --- a/build/build.go +++ b/build/build.go @@ -24,7 +24,6 @@ package main import ( "bufio" - "database/sql" "errors" "flag" "log" @@ -34,14 +33,14 @@ import ( _ "github.com/mattn/go-sqlite3" ) -func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant, error) { +func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) { file, err := os.Open(urlsPath) if err != nil { return nil, err } defer file.Close() - var results []restaurant + var reviews []review var scanner = bufio.NewScanner(file) for scanner.Scan() { @@ -51,24 +50,21 @@ func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]restaurant, return nil, err } - var items []restaurant switch parsed.Host { case "tabelog.com": - items = scrape(line, wc, gc, tabelog{}) + reviews = append(reviews, scrape(line, wc, gc, tabelog{})...) case "www.tripadvisor.com": - items = scrape(line, wc, gc, tripadvisor{}) + reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...) default: return nil, errors.New("unsupported review site") } - - results = append(results, items...) } } - return results, nil + return reviews, nil } -func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) { +func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) { gc, err := newGeoCache(geocachePath) if err != nil { return nil, err @@ -80,131 +76,131 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, erro return nil, err } - restaurants, err := scrapeDataUrls(urlsPath, wc, gc) + reviews, err := scrapeDataUrls(urlsPath, wc, gc) if err != nil { return nil, err } - return restaurants, nil + return reviews, nil } -func computeStnData(restaurants []restaurant, stationsPath string) error { - sq, err := newStationQuery(stationsPath) - if err != nil { - return err - } +// func computeStnData(reviews []restaurant, stationsPath string) error { +// sq, err := newStationQuery(stationsPath) +// if err != nil { +// return err +// } - for i, _ := range restaurants { - r := &restaurants[i] - r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude) - } +// for i, _ := range reviews { +// r := &reviews[i] +// r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude) +// } - return nil -} +// return nil +// } -func dumpData(dbPath string, restaraunts []restaurant) error { - db, err := sql.Open("sqlite3", dbPath) - if err != nil { - return err - } - defer db.Close() +// func dumpData(dbPath string, restaraunts []restaurant) error { +// db, err := sql.Open("sqlite3", dbPath) +// if err != nil { +// return err +// } +// defer db.Close() - _, err = db.Exec(` - DROP TABLE IF EXISTS reviews; - CREATE TABLE reviews( - name VARCHAR(100) NOT NULL, - url VARCHAR(200) NOT NULL, - delicious FLOAT NOT NULL, - accommodating FLOAT NOT NULL, - affordable FLOAT NOT NULL, - atmospheric FLOAT NOT NULL, - latitude FLOAT NOT NULL, - longitude FLOAT NOT NULL, - closestStnDist FLOAT NOT NULL, - closestStnName VARCHAR(100) NOT NULL, - accessCount INTEGER NOT NULL, - id INTEGER PRIMARY KEY - )`) +// _, err = db.Exec(` +// DROP TABLE IF EXISTS reviews; +// CREATE TABLE reviews( +// name VARCHAR(100) NOT NULL, +// url VARCHAR(200) NOT NULL, +// delicious FLOAT NOT NULL, +// accommodating FLOAT NOT NULL, +// affordable FLOAT NOT NULL, +// atmospheric FLOAT NOT NULL, +// latitude FLOAT NOT NULL, +// longitude FLOAT NOT NULL, +// closestStnDist FLOAT NOT NULL, +// closestStnName VARCHAR(100) NOT NULL, +// accessCount INTEGER NOT NULL, +// id INTEGER PRIMARY KEY +// )`) - if err != nil { - return err - } +// if err != nil { +// return err +// } - for _, r := range restaraunts { - _, err = db.Exec(` - INSERT INTO reviews( - name, - url, - delicious, - accommodating, - affordable, - atmospheric, - latitude, - longitude, - closestStnDist, - closestStnName, - accessCount - ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - r.name, - r.url, - r.feats.delicious, - r.feats.accommodating, - r.feats.affordable, - r.feats.atmospheric, - r.latitude, - r.longitude, - r.closestStnDist, - r.closestStnName, - 0) +// for _, r := range restaraunts { +// _, err = db.Exec(` +// INSERT INTO reviews( +// name, +// url, +// delicious, +// accommodating, +// affordable, +// atmospheric, +// latitude, +// longitude, +// closestStnDist, +// closestStnName, +// accessCount +// ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, +// r.name, +// r.url, +// r.feats.delicious, +// r.feats.accommodating, +// r.feats.affordable, +// r.feats.atmospheric, +// r.latitude, +// r.longitude, +// r.closestStnDist, +// r.closestStnName, +// 0) - if err != nil { - return err - } - } +// if err != nil { +// return err +// } +// } - _, err = db.Exec(` - DROP TABLE IF EXISTS categories; - CREATE TABLE categories( - description VARCHAR(200) NOT NULL, - id INTEGER PRIMARY KEY)`) +// _, err = db.Exec(` +// DROP TABLE IF EXISTS categories; +// CREATE TABLE categories( +// description VARCHAR(200) NOT NULL, +// id INTEGER PRIMARY KEY)`) - if err != nil { - return err - } +// if err != nil { +// return err +// } - for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} { - if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil { - return err - } - } +// for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} { +// if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil { +// return err +// } +// } - _, err = db.Exec(` - DROP TABLE IF EXISTS history; - CREATE TABLE history( - date DATETIME NOT NULL, - reviewId INTEGER NOT NULL, - id INTEGER PRIMARY KEY, - FOREIGN KEY(reviewId) REFERENCES reviews(id))`) +// _, err = db.Exec(` +// DROP TABLE IF EXISTS history; +// CREATE TABLE history( +// date DATETIME NOT NULL, +// reviewId INTEGER NOT NULL, +// id INTEGER PRIMARY KEY, +// FOREIGN KEY(reviewId) REFERENCES reviews(id))`) - if err != nil { - return err - } +// if err != nil { +// return err +// } - _, err = db.Exec(` - DROP TABLE IF EXISTS historyGroups; - CREATE TABLE historyGroups( - categoryId INTEGER NOT NULL, - categoryValue FLOAT NOT NULL, - historyId INTEGER NOT NULL, - FOREIGN KEY(historyId) REFERENCES history(id), - FOREIGN KEY(categoryId) REFERENCES categories(id))`) +// _, err = db.Exec(` +// DROP TABLE IF EXISTS historyGroups; +// CREATE TABLE historyGroups( +// categoryId INTEGER NOT NULL, +// categoryValue FLOAT NOT NULL, +// historyId INTEGER NOT NULL, +// FOREIGN KEY(historyId) REFERENCES history(id), +// FOREIGN KEY(categoryId) REFERENCES categories(id))`) - if err != nil { - return err - } +// if err != nil { +// return err +// } - return nil -} +// return nil +// } func main() { dbPath := flag.String("db", "data/db.sqlite3", "database output path") @@ -214,16 +210,16 @@ func main() { webcachePath := flag.String("webcache", "cache/webcache", "web data cache") flag.Parse() - restaurants, err := scrapeData(*urlsPath, *geocachePath, *webcachePath) + reviews, err := scrapeData(*urlsPath, *geocachePath, *webcachePath) if err != nil { log.Fatal(err) } - if err := computeStnData(restaurants, *stationsPath); err != nil { - log.Fatal(err) - } + // if err := computeStnData(reviews, *stationsPath); err != nil { + // log.Fatal(err) + // } - if err := dumpData(*dbPath, restaurants); err != nil { - log.Fatal(err) - } + // if err := dumpData(*dbPath, reviews); err != nil { + // log.Fatal(err) + // } } diff --git a/build/geocache.go b/build/geocache.go index bef53af..02981a5 100644 --- a/build/geocache.go +++ b/build/geocache.go @@ -79,19 +79,23 @@ func (c *geoCache) save() error { return ioutil.WriteFile(c.filename, js, 0644) } -func (c *geoCache) decode(address string) (geoPos, error) { +func (c *geoCache) decode(address string) (latitude float64, longitude float64, err error) { if pos, ok := c.data[address]; ok { - return pos, nil + latitude = pos.Latitude + longitude = pos.Longitude + return } <-c.ticker.C point, err := c.coder.Geocode(address) if err != nil { - return geoPos{}, err + return } - pos := geoPos{point.Lat(), point.Lng()} - c.data[address] = pos - return pos, nil + latitude = point.Lat() + longitude = point.Lng() + + c.data[address] = geoPos{latitude, longitude} + return } diff --git a/build/scrape.go b/build/scrape.go index 99c726d..1476063 100644 --- a/build/scrape.go +++ b/build/scrape.go @@ -30,19 +30,17 @@ import ( "github.com/PuerkitoBio/goquery" ) -type features struct { - delicious float64 - accommodating float64 - affordable float64 - atmospheric float64 +type feature struct { + value float64 + weight float64 } -type restaurant struct { +type review struct { name string address string url string - feats features + features map[string]feature latitude float64 longitude float64 @@ -53,7 +51,15 @@ type restaurant struct { type scraper interface { index(doc *goquery.Document) (string, []string) - review(doc *goquery.Document) (string, string, features, error) + review(doc *goquery.Document) (string, string, map[string]feature, error) +} + +type decoder interface { + decode(address string) (float64, float64, error) +} + +type loader interface { + load(url string) (*goquery.Document, error) } func makeAbsUrl(ref, base string) (string, error) { @@ -70,13 +76,12 @@ func makeAbsUrl(ref, base string) (string, error) { return b.ResolveReference(r).String(), nil } -func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) { +func decodeReviews(in chan review, out chan review, dec decoder) { for { if res, ok := <-in; ok { - pos, err := gc.decode(res.address) + var err error + res.latitude, res.longitude, err = dec.decode(res.address) if err == nil { - res.latitude = pos.Latitude - res.longitude = pos.Longitude out <- res } else { log.Printf("failed to decode address for %s (%v)", res.url, err) @@ -88,30 +93,30 @@ func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) { } } -func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) { +func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) { defer group.Done() - doc, err := wc.load(url) + doc, err := lod.load(url) if err != nil { log.Printf("failed to load review at %s (%v)", url, err) return } - name, address, feats, err := scr.review(doc) + name, address, features, err := scr.review(doc) if err != nil { log.Printf("failed to scrape review at %s (%v)", url, err) return } - out <- restaurant{ - name: name, - address: address, - feats: feats, - url: url} + out <- review{ + name: name, + address: address, + features: features, + url: url} } -func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) { - doc, err := wc.load(indexUrl) +func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) { + doc, err := lod.load(indexUrl) if err != nil { log.Printf("failed to load index at %s (%v)", indexUrl, err) return @@ -130,7 +135,7 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper } group.Add(1) - go scrapeReview(absUrl, out, wc, &group, scr) + go scrapeReview(absUrl, out, lod, scr, &group) } group.Wait() @@ -142,18 +147,18 @@ func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper log.Fatal(err) } - scrapeIndex(absUrl, out, wc, scr) + scrapeIndex(absUrl, out, lod, scr) } } -func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant { - out := make(chan restaurant, 128) - in := make(chan restaurant, 128) +func scrape(url string, lod loader, dec decoder, scr scraper) []review { + out := make(chan review, 128) + in := make(chan review, 128) - go scrapeIndex(url, in, wc, scr) - go decodeReviews(in, out, gc) + go scrapeIndex(url, in, lod, scr) + go decodeReviews(in, out, dec) - var results []restaurant + var results []review for { if res, ok := <-out; ok { results = append(results, res) diff --git a/build/tabelog.go b/build/tabelog.go index 22afd98..0598f78 100644 --- a/build/tabelog.go +++ b/build/tabelog.go @@ -50,7 +50,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) { return nextIndexUrl, reviewUrls } -func (tabelog) review(doc *goquery.Document) (name, address string, feat features, err error) { +func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) { name = doc.Find("a.rd-header__rst-name-main").Text() if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 { @@ -60,19 +60,19 @@ func (tabelog) review(doc *goquery.Document) (name, address string, feat feature return } - f := make(map[string]float64) + features = make(map[string]feature) + for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} { - text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text() - if f[category], err = strconv.ParseFloat(text, 8); err != nil { + valueText := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text() + + var value float64 + if value, err = strconv.ParseFloat(valueText, 8); err != nil { err = fmt.Errorf("invalid value for %s", category) return } - } - feat.accommodating = f["service"]/2.5 - 1.0 - feat.affordable = f["cost"]/2.5 - 1.0 - feat.atmospheric = f["atmosphere"]/2.5 - 1.0 - feat.delicious = f["dishes"]/2.5 - 1.0 + features[category] = feature{value/2.5 - 1.0, 1.0} + } return } diff --git a/build/tripadvisor.go b/build/tripadvisor.go index 1edfecd..5931af9 100644 --- a/build/tripadvisor.go +++ b/build/tripadvisor.go @@ -50,7 +50,7 @@ func (tripadvisor) index(doc *goquery.Document) (string, []string) { return nextIndexUrl, reviewUrls } -func (tripadvisor) review(doc *goquery.Document) (name, address string, feat features, err error) { +func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]feature, err error) { name = strings.TrimSpace(doc.Find("h1#HEADING").Text()) address = strings.TrimSpace(doc.Find("address span.format_address").Text()) @@ -60,20 +60,20 @@ func (tripadvisor) review(doc *goquery.Document) (name, address string, feat fea return } - f := make(map[string]float64) + features = make(map[string]feature) + for index, category := range []string{"food", "service", "value", "atmosphere"} { - alt, _ := ratings.Eq(index).Attr("alt") - rating := strings.Split(alt, " ")[0] - if f[category], err = strconv.ParseFloat(rating, 8); err != nil { + altText, _ := ratings.Eq(index).Attr("alt") + valueText := strings.Split(altText, " ")[0] + + var value float64 + if value, err = strconv.ParseFloat(valueText, 8); err != nil { err = fmt.Errorf("invalid value for %s", category) return } - } - feat.accommodating = f["service"]/2.5 - 1.0 - feat.affordable = f["value"]/2.5 - 1.0 - feat.atmospheric = f["atmosphere"]/2.5 - 1.0 - feat.delicious = f["food"]/2.5 - 1.0 + features[category] = feature{value/2.5 - 1.0, 1.0} + } return }