diff --git a/scrape.go b/scrape.go index 2ac3d23..238d404 100644 --- a/scrape.go +++ b/scrape.go @@ -25,7 +25,7 @@ package main import "log" func main() { - urlFormat := "http://tabelog.com/en/rstLst/{{.Page}}/?lat=35.465808055555996&lon=139.61964361111&zoom=16&RdoCosTp=2&LstCos=0&LstCosT=11&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0" + urlFormat := "http://tabelog.com/en/kanagawa/rstLst/{{.Page}}/" tabelogReviews := "tabelog.json" if err := scrapeTabelog(tabelogReviews, urlFormat); err != nil { diff --git a/tabelog.go b/tabelog.go index 5a5d211..668afcf 100644 --- a/tabelog.go +++ b/tabelog.go @@ -29,6 +29,7 @@ import ( "log" "strconv" "strings" + "sync" "text/template" "github.com/PuerkitoBio/goquery" @@ -51,53 +52,6 @@ type tabelogReview struct { Url string } -func scrapeReview(url string, out chan tabelogReview) { - doc, err := goquery.NewDocument(url) - if err != nil { - log.Print(err) - return - } - - var r tabelogReview - - r.Url = url - r.Name = doc.Find("body > article > header > div.rd-header.l-container > div > div.rd-header__headline > h2 > a").Text() - r.Address = strings.TrimSpace(doc.Find("#anchor-rd-detail > section > table > tbody > tr > td > p.rd-detail-info__rst-address > span").Text()) - - if r.Dishes, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil { - return - } - if r.Service, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(4)").Text(), 8); err != nil { - return - } - if r.Atmosphere, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(6)").Text(), 8); err != nil { - return - } - if r.Cost, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(8)").Text(), 8); err != nil { - return - } - if r.Drinks, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(10)").Text(), 8); err != nil { - return - } - - out <- r -} - -func scrapeIndex(url string, out chan tabelogReview) error { - doc, err := goquery.NewDocument(url) - if err != nil { - return err - } - - doc.Find("#js-map-search-result-list > li > div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) { - if href, ok := sel.Attr("href"); ok { - go scrapeReview(href, out) - } - }) - - return nil -} - func dumpReviews(filename string, in chan tabelogReview, out chan error) { var reviews []tabelogReview for { @@ -118,6 +72,62 @@ func dumpReviews(filename string, in chan tabelogReview, out chan error) { out <- ioutil.WriteFile(filename, js, 0644) } +func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup) { + defer wg.Done() + + doc, err := goquery.NewDocument(url) + if err != nil { + log.Print(err) + return + } + + addresses := doc.Find("p.rd-detail-info__rst-address") + if addresses.Length() != 2 { + return + } + + var review tabelogReview + + review.Url = url + review.Name = doc.Find("div.rd-header__headline > h2 > a").Text() + review.Address = strings.TrimSpace(addresses.First().Text()) + + if review.Dishes, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil { + return + } + if review.Service, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(4)").Text(), 8); err != nil { + return + } + if review.Atmosphere, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(6)").Text(), 8); err != nil { + return + } + if review.Cost, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(8)").Text(), 8); err != nil { + return + } + if review.Drinks, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(10)").Text(), 8); err != nil { + return + } + + out <- review +} + +func scrapeIndex(url string, out chan tabelogReview, wg *sync.WaitGroup) { + defer wg.Done() + + doc, err := goquery.NewDocument(url) + if err != nil { + log.Print(err) + return + } + + doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) { + if href, ok := sel.Attr("href"); ok { + wg.Add(1) + go scrapeReview(href, out, wg) + } + }) +} + func scrapeTabelog(filename, url string) error { out := make(chan tabelogReview) in := make(chan error) @@ -126,17 +136,19 @@ func scrapeTabelog(filename, url string) error { t := template.New("tabelog") t.Parse(url) - for i := 1; i <= 2; i++ { + var wg sync.WaitGroup + for i := 1; i <= 1; i++ { var url bytes.Buffer if err := t.Execute(&url, tabelogParams{i}); err != nil { log.Fatal(err) } - if err := scrapeIndex(string(url.Bytes()), out); err != nil { - return err - } + wg.Add(1) + go scrapeIndex(string(url.Bytes()), out, &wg) } + wg.Wait() + close(out) return <-in }