From b5ee8380a0959eb908db89fe31f4631829e71fab Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Mon, 24 Aug 2015 17:03:00 +0900 Subject: [PATCH] Modifying data range --- build/build.go | 14 ++++---------- build/scrape.go | 21 ++++++++++++++------- build/tabelog.go | 11 ++++++++--- build/tripadvisor.go | 11 ++++++++--- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/build/build.go b/build/build.go index 35575c9..c2d1ea6 100644 --- a/build/build.go +++ b/build/build.go @@ -102,10 +102,6 @@ func computeStnData(restaurants []restaurant, stationsPath string) error { return nil } -func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) { - return r.features["food"], r.features["service"], r.features["value"], r.features["atmosphere"] -} - func dumpData(dbPath string, restaraunts []restaurant) error { db, err := sql.Open("sqlite3", dbPath) if err != nil { @@ -135,8 +131,6 @@ func dumpData(dbPath string, restaraunts []restaurant) error { } for _, r := range restaraunts { - delicious, accommodating, affordable, atmospheric := buildFeatures(r) - _, err = db.Exec(` INSERT INTO reviews( name, @@ -153,10 +147,10 @@ func dumpData(dbPath string, restaraunts []restaurant) error { ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, r.name, r.url, - delicious, - accommodating, - affordable, - atmospheric, + r.feats.delicious, + r.feats.accommodating, + r.feats.affordable, + r.feats.atmospheric, r.longitude, r.latitude, r.closestStnDist, diff --git a/build/scrape.go b/build/scrape.go index 23a9c80..99c726d 100644 --- a/build/scrape.go +++ b/build/scrape.go @@ -30,12 +30,19 @@ import ( "github.com/PuerkitoBio/goquery" ) +type features struct { + delicious float64 + accommodating float64 + affordable float64 + atmospheric float64 +} + type restaurant struct { name string address string url string - features map[string]float64 + feats features latitude float64 longitude float64 @@ -46,7 +53,7 @@ type restaurant struct { type scraper interface { index(doc *goquery.Document) (string, []string) - review(doc *goquery.Document) (string, string, map[string]float64, error) + review(doc *goquery.Document) (string, string, features, error) } func makeAbsUrl(ref, base string) (string, error) { @@ -90,17 +97,17 @@ func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.Wai return } - name, address, features, err := scr.review(doc) + name, address, feats, err := scr.review(doc) if err != nil { log.Printf("failed to scrape review at %s (%v)", url, err) return } out <- restaurant{ - name: name, - address: address, - features: features, - url: url} + name: name, + address: address, + feats: feats, + url: url} } func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) { diff --git a/build/tabelog.go b/build/tabelog.go index ad8abc1..22afd98 100644 --- a/build/tabelog.go +++ b/build/tabelog.go @@ -50,7 +50,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) { return nextIndexUrl, reviewUrls } -func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]float64, err error) { +func (tabelog) review(doc *goquery.Document) (name, address string, feat features, err error) { name = doc.Find("a.rd-header__rst-name-main").Text() if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 { @@ -60,14 +60,19 @@ func (tabelog) review(doc *goquery.Document) (name, address string, features map return } - features = make(map[string]float64) + f := make(map[string]float64) for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} { text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text() - if features[category], err = strconv.ParseFloat(text, 8); err != nil { + if f[category], err = strconv.ParseFloat(text, 8); err != nil { err = fmt.Errorf("invalid value for %s", category) return } } + feat.accommodating = f["service"]/2.5 - 1.0 + feat.affordable = f["cost"]/2.5 - 1.0 + feat.atmospheric = f["atmosphere"]/2.5 - 1.0 + feat.delicious = f["dishes"]/2.5 - 1.0 + return } diff --git a/build/tripadvisor.go b/build/tripadvisor.go index eaf60cb..1edfecd 100644 --- a/build/tripadvisor.go +++ b/build/tripadvisor.go @@ -50,7 +50,7 @@ func (tripadvisor) index(doc *goquery.Document) (string, []string) { return nextIndexUrl, reviewUrls } -func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]float64, err error) { +func (tripadvisor) review(doc *goquery.Document) (name, address string, feat features, err error) { name = strings.TrimSpace(doc.Find("h1#HEADING").Text()) address = strings.TrimSpace(doc.Find("address span.format_address").Text()) @@ -60,15 +60,20 @@ func (tripadvisor) review(doc *goquery.Document) (name, address string, features return } - features = make(map[string]float64) + f := make(map[string]float64) for index, category := range []string{"food", "service", "value", "atmosphere"} { alt, _ := ratings.Eq(index).Attr("alt") rating := strings.Split(alt, " ")[0] - if features[category], err = strconv.ParseFloat(rating, 8); err != nil { + if f[category], err = strconv.ParseFloat(rating, 8); err != nil { err = fmt.Errorf("invalid value for %s", category) return } } + feat.accommodating = f["service"]/2.5 - 1.0 + feat.affordable = f["value"]/2.5 - 1.0 + feat.atmospheric = f["atmosphere"]/2.5 - 1.0 + feat.delicious = f["food"]/2.5 - 1.0 + return }