From 6138b978d4deea20101cef275b661cefe52951e7 Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Sun, 16 Aug 2015 19:30:45 +0900 Subject: [PATCH] Scraping correctly again --- scraper.go | 7 +++---- tabelog.go | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scraper.go b/scraper.go index 953141f..fd36ed5 100644 --- a/scraper.go +++ b/scraper.go @@ -46,7 +46,7 @@ type profiler interface { profile(doc *goquery.Document) *review } -func makeAbsUrl(base, ref string) (string, error) { +func makeAbsUrl(ref, base string) (string, error) { b, err := url.Parse(base) if err != nil { return "", err @@ -85,13 +85,12 @@ func scrapeReview(url string, out chan review, cache *webCache, group *sync.Wait if err != nil { log.Printf("failed to scrape review at %s (%v)", url, err) } else if r := prof.profile(doc); r != nil { + r.url = url out <- *r } } func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) { - var group sync.WaitGroup - doc, err := cache.load(indexUrl) if err != nil { log.Printf("failed to scrape index at %s (%v)", indexUrl, err) @@ -103,6 +102,7 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profile log.Fatal(err) } + var group sync.WaitGroup for _, reviewUrl := range reviewUrls { absUrl, err := makeAbsUrl(reviewUrl, indexUrl) if err != nil { @@ -112,7 +112,6 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profile group.Add(1) go scrapeReview(absUrl, out, cache, &group, prof) } - group.Wait() if nextIndexUrl == "" { diff --git a/tabelog.go b/tabelog.go index 08833d8..be425e8 100644 --- a/tabelog.go +++ b/tabelog.go @@ -51,9 +51,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) { func (tabelog) profile(doc *goquery.Document) *review { var r review - r.url = doc.Url.String() r.name = doc.Find("a.rd-header__rst-name-main").Text() - if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 { r.address = strings.TrimSpace(addresses.First().Text()) } else { @@ -61,6 +59,8 @@ func (tabelog) profile(doc *goquery.Document) *review { } var err error + + r.features = make(map[string]float64) if r.features["dishes"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil { return nil }