
More cleanup

Alex Yatskov, 2015-08-17 14:16:07 +09:00
parent 5d4089581e
commit 6c5b62ece5
2 changed files with 35 additions and 34 deletions


@@ -37,5 +37,13 @@ func main() {
 	}
 
 	t := tabelog{}
-	scrape("http://tabelog.com/en/kanagawa/rstLst/1/", wc, gc, t)
+
+	out := make(chan restaurant)
+	scrape("http://tabelog.com/en/kanagawa/rstLst/1/", out, wc, gc, t)
+
+	for {
+		if _, ok := <-out; !ok {
+			return
+		}
+	}
 }
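
The new main drains the output channel until scrape's pipeline closes it. Below is a minimal sketch of that consumption idiom, assuming only a trimmed-down restaurant type; the explicit two-value receive in the hunk above behaves the same as ranging over the channel.

package main

import "fmt"

// Trimmed-down stand-in for the restaurant type in this commit.
type restaurant struct {
	name string
}

func main() {
	out := make(chan restaurant)

	// Stand-in producer; the real code starts the scrape pipeline here.
	go func() {
		out <- restaurant{name: "example"}
		close(out) // ends the loop below, like the !ok branch above
	}()

	// Equivalent to: for { if _, ok := <-out; !ok { return } }
	for r := range out {
		fmt.Println(r.name)
	}
}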


@@ -30,7 +30,7 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )
 
-type review struct {
+type restaurant struct {
 	name    string
 	address string
 	url     string
@@ -41,7 +41,7 @@ type review struct {
 	longitude float64
 }
 
-type profiler interface {
+type scraper interface {
 	index(doc *goquery.Document) (string, []string)
 	review(doc *goquery.Document) (string, string, map[string]float64, error)
 }
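
The tabelog value passed to scrape in the first file presumably implements this renamed scraper interface; its real implementation is not part of this diff. A hypothetical stub showing the required method set follows, meant to sit in the same file as the interface and reuse its goquery import; the method bodies and CSS selectors are placeholders, not the actual tabelog logic.

// Hypothetical stub; selectors and bodies are placeholders only.
type tabelog struct{}

func (t tabelog) index(doc *goquery.Document) (string, []string) {
	// Collect review page URLs from the index, plus the next index URL.
	var reviewUrls []string
	doc.Find("a.review").Each(func(_ int, s *goquery.Selection) {
		if href, ok := s.Attr("href"); ok {
			reviewUrls = append(reviewUrls, href)
		}
	})
	next, _ := doc.Find("a.next").Attr("href")
	return next, reviewUrls
}

func (t tabelog) review(doc *goquery.Document) (string, string, map[string]float64, error) {
	// Extract the restaurant's name, address, and feature scores.
	name := doc.Find("h1").Text()
	address := doc.Find(".address").Text()
	return name, address, map[string]float64{}, nil
}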
@@ -60,16 +60,16 @@ func makeAbsUrl(ref, base string) (string, error) {
 	return b.ResolveReference(r).String(), nil
 }
 
-func decodeReviews(in chan review, out chan review, cache *geoCache) {
+func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
 	for {
-		if r, ok := <-in; ok {
-			pos, err := cache.decode(r.address)
+		if res, ok := <-in; ok {
+			pos, err := gc.decode(res.address)
 			if err == nil {
-				r.latitude = pos.Latitude
-				r.longitude = pos.Longitude
-				out <- r
+				res.latitude = pos.Latitude
+				res.longitude = pos.Longitude
+				out <- res
 			} else {
-				log.Printf("failed to decode address for %s (%v)", r.url, err)
+				log.Printf("failed to decode address for %s (%v)", res.url, err)
 			}
 		} else {
 			close(out)
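
decodeReviews is a classic pipeline stage: receive until the upstream channel closes, transform, forward, then close the downstream channel so shutdown propagates to the consumer. The same shape in isolation, assuming the restaurant type and log import from this file; transform is a placeholder for the gc.decode geocoding call, and ranging over in is equivalent to the explicit two-value receive in the hunk above.

// Pipeline-stage skeleton matching decodeReviews' structure.
func stage(in, out chan restaurant, transform func(restaurant) (restaurant, error)) {
	for res := range in {
		if t, err := transform(res); err == nil {
			out <- t
		} else {
			log.Printf("failed to transform %s (%v)", res.url, err)
		}
	}
	close(out) // upstream is done; tell downstream
}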
@@ -78,32 +78,36 @@ func decodeReviews(in chan review, out chan review, cache *geoCache) {
 	}
 }
 
-func scrapeReview(url string, out chan review, cache *webCache, group *sync.WaitGroup, prof profiler) {
+func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
 	defer group.Done()
 
-	doc, err := cache.load(url)
+	doc, err := wc.load(url)
 	if err != nil {
 		log.Printf("failed to load review at %s (%v)", url, err)
 		return
 	}
 
-	name, address, features, err := prof.review(doc)
+	name, address, features, err := scr.review(doc)
 	if err != nil {
 		log.Printf("failed to scrape review at %s (%v)", url, err)
 		return
 	}
 
-	out <- review{name: name, address: address, features: features, url: url}
+	out <- restaurant{
+		name: name,
+		address: address,
+		features: features,
+		url: url}
 }
 
-func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
-	doc, err := cache.load(indexUrl)
+func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
+	doc, err := wc.load(indexUrl)
 	if err != nil {
 		log.Printf("failed to load index at %s (%v)", indexUrl, err)
 		return
 	}
 
-	nextIndexUrl, reviewUrls := prof.index(doc)
+	nextIndexUrl, reviewUrls := scr.index(doc)
 	if err != nil {
 		log.Fatal(err)
 	}
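
scrapeReview runs as one goroutine per review URL, and scrapeIndex tracks the batch with a sync.WaitGroup, as the next hunk shows. A self-contained sketch of that fan-out pattern follows, with a stand-in worker and example URLs in place of the real scraping; note the closer goroutine, which is what lets the consumer's range loop end once the batch is done.

package main

import (
	"fmt"
	"sync"
)

func main() {
	urls := []string{"http://example.com/a", "http://example.com/b"}
	results := make(chan string)

	var group sync.WaitGroup
	for _, url := range urls {
		group.Add(1)
		go func(u string) {
			defer group.Done()        // mirrors scrapeReview's defer
			results <- "scraped " + u // stand-in for the real work
		}(url)
	}

	// Close results only after every worker has finished.
	go func() {
		group.Wait()
		close(results)
	}()

	for r := range results {
		fmt.Println(r)
	}
}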
@@ -116,7 +120,7 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
 		}
 
 		group.Add(1)
-		go scrapeReview(absUrl, out, cache, &group, prof)
 	}
 
 	group.Wait()
@@ -128,23 +132,12 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
 			log.Fatal(err)
 		}
 
-		scrapeIndex(absUrl, out, cache, prof)
+		scrapeIndex(absUrl, out, wc, scr)
 	}
 }
 
-func scrape(url string, wc *webCache, gc *geoCache, prof profiler) []review {
-	scrapeChan := make(chan review)
-	decodeChan := make(chan review)
-
-	go scrapeIndex(url, scrapeChan, wc, prof)
-	go decodeReviews(scrapeChan, decodeChan, gc)
-
-	var reviews []review
-	for {
-		if r, ok := <-decodeChan; ok {
-			reviews = append(reviews, r)
-		} else {
-			return reviews
-		}
-	}
+func scrape(url string, out chan restaurant, wc *webCache, gc *geoCache, scr scraper) {
+	in := make(chan restaurant)
+	go scrapeIndex(url, in, wc, scr)
+	go decodeReviews(in, out, gc)
 }
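
With this change, scrape no longer accumulates and returns a []review; it wires the two stages together on a caller-supplied channel and returns immediately, leaving the caller to drain out. An end-to-end sketch of the new shape, with stub stages standing in for scrapeIndex and decodeReviews; closing the channel in the final stage is what terminates main's receive loop.

package main

import "fmt"

// Trimmed-down stand-in for the restaurant type.
type restaurant struct {
	name string
}

// Stub standing in for scrapeIndex: produce results, then close.
func produce(out chan restaurant) {
	out <- restaurant{name: "sushi bar"}
	close(out)
}

// Stub standing in for decodeReviews: forward, then propagate the close.
func decode(in, out chan restaurant) {
	for r := range in {
		out <- r // geocoding would happen here
	}
	close(out)
}

// Mirrors the new scrape: start both stages and return immediately.
func scrape(out chan restaurant) {
	in := make(chan restaurant)
	go produce(in)
	go decode(in, out)
}

func main() {
	out := make(chan restaurant)
	scrape(out)
	for r := range out {
		fmt.Println(r.name)
	}
}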