More cleanup
parent 5d4089581e
commit 6c5b62ece5
2 changed files with 35 additions and 34 deletions

scrape.go | 10 +++++++++-
--- a/scrape.go
+++ b/scrape.go
@@ -37,5 +37,13 @@ func main() {
     }
 
     t := tabelog{}
-    scrape("http://tabelog.com/en/kanagawa/rstLst/1/", wc, gc, t)
+
+    out := make(chan restaurant)
+    scrape("http://tabelog.com/en/kanagawa/rstLst/1/", out, wc, gc, t)
+
+    for {
+        if _, ok := <-out; !ok {
+            return
+        }
+    }
 }
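Note: the new scrape signature streams results over a channel instead of returning a slice, so main now drains the channel itself. A minimal sketch of a caller that collects the streamed values back into a slice, which is roughly what the old return value provided (assuming wc, gc, and tabelog{} are initialized as in main):

    out := make(chan restaurant)
    scrape("http://tabelog.com/en/kanagawa/rstLst/1/", out, wc, gc, tabelog{})

    // scrape starts its goroutines and returns immediately; the loop
    // ends once decodeReviews closes out.
    var results []restaurant
    for r := range out {
        results = append(results, r)
    }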
scraper.go | 59 ++++++++++++++++++++++++++---------------------------------
--- a/scraper.go
+++ b/scraper.go
@@ -30,7 +30,7 @@ import (
     "github.com/PuerkitoBio/goquery"
 )
 
-type review struct {
+type restaurant struct {
     name      string
     address   string
     url       string
@@ -41,7 +41,7 @@ type review struct {
     longitude float64
 }
 
-type profiler interface {
+type scraper interface {
     index(doc *goquery.Document) (string, []string)
     review(doc *goquery.Document) (string, string, map[string]float64, error)
 }
@@ -60,16 +60,16 @@ func makeAbsUrl(ref, base string) (string, error) {
     return b.ResolveReference(r).String(), nil
 }
 
-func decodeReviews(in chan review, out chan review, cache *geoCache) {
+func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
     for {
-        if r, ok := <-in; ok {
-            pos, err := cache.decode(r.address)
+        if res, ok := <-in; ok {
+            pos, err := gc.decode(res.address)
             if err == nil {
-                r.latitude = pos.Latitude
-                r.longitude = pos.Longitude
-                out <- r
+                res.latitude = pos.Latitude
+                res.longitude = pos.Longitude
+                out <- res
             } else {
-                log.Printf("failed to decode address for %s (%v)", r.url, err)
+                log.Printf("failed to decode address for %s (%v)", res.url, err)
             }
         } else {
             close(out)
@@ -78,32 +78,36 @@ func decodeReviews(in chan review, out chan review, cache *geoCache) {
     }
 }
 
-func scrapeReview(url string, out chan review, cache *webCache, group *sync.WaitGroup, prof profiler) {
+func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
     defer group.Done()
 
-    doc, err := cache.load(url)
+    doc, err := wc.load(url)
     if err != nil {
         log.Printf("failed to load review at %s (%v)", url, err)
         return
     }
 
-    name, address, features, err := prof.review(doc)
+    name, address, features, err := scr.review(doc)
     if err != nil {
         log.Printf("failed to scrape review at %s (%v)", url, err)
         return
     }
 
-    out <- review{name: name, address: address, features: features, url: url}
+    out <- restaurant{
+        name:     name,
+        address:  address,
+        features: features,
+        url:      url}
 }
 
-func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
-    doc, err := cache.load(indexUrl)
+func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
+    doc, err := wc.load(indexUrl)
     if err != nil {
         log.Printf("failed to load index at %s (%v)", indexUrl, err)
         return
     }
 
-    nextIndexUrl, reviewUrls := prof.index(doc)
+    nextIndexUrl, reviewUrls := scr.index(doc)
     if err != nil {
         log.Fatal(err)
     }
@@ -116,7 +120,7 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
         }
 
         group.Add(1)
-        go scrapeReview(absUrl, out, cache, &group, prof)
+        go scrapeReview(absUrl, out, wc, &group, scr)
     }
     group.Wait()
 
@@ -128,23 +132,12 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
             log.Fatal(err)
         }
 
-        scrapeIndex(absUrl, out, cache, prof)
+        scrapeIndex(absUrl, out, wc, scr)
     }
 }
 
-func scrape(url string, wc *webCache, gc *geoCache, prof profiler) []review {
-    scrapeChan := make(chan review)
-    decodeChan := make(chan review)
-
-    go scrapeIndex(url, scrapeChan, wc, prof)
-    go decodeReviews(scrapeChan, decodeChan, gc)
-
-    var reviews []review
-    for {
-        if r, ok := <-decodeChan; ok {
-            reviews = append(reviews, r)
-        } else {
-            return reviews
-        }
-    }
+func scrape(url string, out chan restaurant, wc *webCache, gc *geoCache, scr scraper) {
+    in := make(chan restaurant)
+    go scrapeIndex(url, in, wc, scr)
+    go decodeReviews(in, out, gc)
 }
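For reference, a hypothetical minimal implementation of the renamed scraper interface. The goquery selectors below are invented placeholders, not Tabelog's actual markup, and the real tabelog type in this repository may differ; only the method signatures come from scraper.go:

    // Sketch of a scraper implementation; selectors are illustrative.
    type tabelog struct{}

    func (tabelog) index(doc *goquery.Document) (string, []string) {
        var reviewUrls []string
        doc.Find("a.review-link").Each(func(_ int, s *goquery.Selection) {
            if href, ok := s.Attr("href"); ok {
                reviewUrls = append(reviewUrls, href)
            }
        })
        // nextIndexUrl is empty when there is no next page.
        nextIndexUrl, _ := doc.Find("a.next-page").Attr("href")
        return nextIndexUrl, reviewUrls
    }

    func (tabelog) review(doc *goquery.Document) (string, string, map[string]float64, error) {
        name := strings.TrimSpace(doc.Find("h1.name").Text())
        address := strings.TrimSpace(doc.Find("p.address").Text())
        if name == "" || address == "" {
            return "", "", nil, errors.New("review page missing name or address")
        }
        // Feature scores would be parsed from the page; empty map for brevity.
        return name, address, map[string]float64{}, nil
    }

(imports assumed: errors, strings, and github.com/PuerkitoBio/goquery)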