// main is intentionally empty at this point in the refactor: the scraping
// pipeline has been moved into the generic scraper (scraper.go) and the
// site-specific profiler (tabelog.go); wiring will be reintroduced here.
func main() {
}
IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package main + +import ( + "log" + "net/url" + "sync" + + "github.com/PuerkitoBio/goquery" +) + +type review struct { + name string + address string + url string + + features map[string]float64 + + latitude float64 + longitude float64 +} + +type profiler interface { + index(doc *goquery.Document) (string, []string) + profile(doc *goquery.Document) review +} + +func makeAbsUrl(base, ref string) (string, error) { + b, err := url.Parse(base) + if err != nil { + return "", err + } + + r, err := url.Parse(ref) + if err != nil { + return "", err + } + + return b.ResolveReference(r).String(), nil +} + +func decodeReviews(in chan review, out chan review, cache *geoCache) { + for { + if r, ok := <-in; ok { + pos, err := cache.decode(r.address) + if err == nil { + r.latitude = pos.Latitude + r.longitude = pos.Longitude + out <- r + } else { + log.Printf("failed to decode address for %s (%v)", r.url, err) + } + } else { + close(out) + return + } + } +} + +func scrapeReview(url string, out chan review, cache *webCache, group *sync.WaitGroup, prof profiler) { + defer group.Done() + + doc, err := cache.load(url) + if err != nil { + log.Printf("failed to scrape review at %s (%v)", url, err) + } else { + out <- prof.profile(doc) + } +} + +func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) { + var group sync.WaitGroup + + doc, err := cache.load(indexUrl) + if err != nil { + log.Printf("failed to scrape index at %s (%v)", indexUrl, err) + return + } + + nextIndexUrl, reviewUrls := prof.index(doc) + if err != nil { + log.Fatal(err) + } + + for _, reviewUrl := range reviewUrls { + absUrl, err := makeAbsUrl(reviewUrl, indexUrl) + if err != nil { + log.Fatal(err) + } + + 
group.Add(1) + go scrapeReview(absUrl, out, cache, &group, prof) + } + + group.Wait() + + if nextIndexUrl == "" { + close(out) + } else { + absUrl, err := makeAbsUrl(nextIndexUrl, indexUrl) + if err != nil { + log.Fatal(err) + } + + scrapeIndex(absUrl, out, cache, prof) + } +} + +func scrape(url string, wc *webCache, gc *geoCache, prof profiler) []review { + scrapeChan := make(chan review) + decodeChan := make(chan review) + + go scrapeIndex(url, scrapeChan, wc, prof) + go decodeReviews(scrapeChan, decodeChan, gc) + + var reviews []review + for { + if r, ok := <-decodeChan; ok { + reviews = append(reviews, r) + } else { + return reviews + } + } +} diff --git a/tabelog.go b/tabelog.go index d3fe208..67842ef 100644 --- a/tabelog.go +++ b/tabelog.go @@ -23,185 +23,59 @@ package main import ( - "encoding/json" - "io/ioutil" - "log" - "net/url" "strconv" "strings" - "sync" "github.com/PuerkitoBio/goquery" ) -type tabelogParams struct { - Page int +type tabelog struct { } -type tabelogReview struct { - Name string - Address string - Url string - - Dishes float64 - Service float64 - Atmosphere float64 - Cost float64 - Drinks float64 - - Latitude float64 - Longitude float64 -} - -func makeAbsUrl(base, ref string) (string, error) { - b, err := url.Parse(base) - if err != nil { - return "", err - } - - r, err := url.Parse(ref) - if err != nil { - return "", err - } - - return b.ResolveReference(r).String(), nil -} - -func dumpReviews(filename string, in chan tabelogReview) error { - var reviews []tabelogReview - for { - if review, ok := <-in; ok { - reviews = append(reviews, review) - } else { - break - } - } - - js, err := json.MarshalIndent(reviews, "", " ") - if err != nil { - return err - } - - if err := ioutil.WriteFile(filename, js, 0644); err != nil { - return err - } - - return nil -} - -func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) { - for { - if review, ok := <-in; ok { - pos, err := gc.decode(review.Address) - if err == nil { - 
review.Latitude = pos.Latitude - review.Longitude = pos.Longitude - out <- review - } else { - log.Printf("failed to decode address for %s (%v)", review.Url, err) - } - } else { - close(out) - return - } - } -} - -func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) { - defer wg.Done() - - doc, err := wc.load(url) - if err != nil { - log.Printf("failed to scrape review at %s (%v)", url, err) - return - } - - addresses := doc.Find("p.rd-detail-info__rst-address") - if addresses.Length() != 2 { - return - } - - var review tabelogReview - - review.Url = url - review.Name = doc.Find("a.rd-header__rst-name-main").Text() - review.Address = strings.TrimSpace(addresses.First().Text()) - - if review.Dishes, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil { - return - } - if review.Service, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(4)").Text(), 8); err != nil { - return - } - if review.Atmosphere, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(6)").Text(), 8); err != nil { - return - } - if review.Cost, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(8)").Text(), 8); err != nil { - return - } - if review.Drinks, err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(10)").Text(), 8); err != nil { - return - } - - out <- review -} - -func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) { - doc, err := wc.load(url) - if err != nil { - log.Printf("failed to scrape index at %s (%v)", url, err) - return - } - +func (t *tabelog) index(doc *goquery.Document) (string, []string) { + var reviewUrls []string doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) { if href, ok := sel.Attr("href"); ok { - wg.Add(1) - - absUrl, err := makeAbsUrl(url, href) - if err != nil { - log.Fatal(err) - } - - go scrapeReview(absUrl, out, wc, wg) + reviewUrls = 
append(reviewUrls, href) } }) + var nextIndexUrl string if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok { - absUrl, err := makeAbsUrl(url, href) - if err != nil { - log.Fatal(err) - } - - scrapeIndex(absUrl, out, wc, wg) - } -} - -func scrapeReviews(url string, out chan tabelogReview, wc *webCache) error { - var wg sync.WaitGroup - scrapeIndex(url, out, wc, &wg) - wg.Wait() - - close(out) - return nil -} - -func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error { - wc, err := newWebCache(webCacheDir) - if err != nil { - return err + nextIndexUrl = href } - gc, err := newGeoCache(geoCacheFile) - if err != nil { - return err + return nextIndexUrl, reviewUrls +} + +func (t *tabelog) profile(doc *goquery.Document) *review { + var r review + + r.url = doc.Url.String() + r.name = doc.Find("a.rd-header__rst-name-main").Text() + + if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 { + r.address = strings.TrimSpace(addresses.First().Text()) + } else { + return nil } - scrapeChan := make(chan tabelogReview, 2000) - decodeChan := make(chan tabelogReview, 2000) + var err error + if r.features["dishes"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil { + return nil + } + if r.features["service"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(4)").Text(), 8); err != nil { + return nil + } + if r.features["atmosphere"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(6)").Text(), 8); err != nil { + return nil + } + if r.features["cost"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(8)").Text(), 8); err != nil { + return nil + } + if r.features["drinks"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(10)").Text(), 8); err != nil { + return nil + } - go decodeReviews(scrapeChan, decodeChan, gc) - scrapeReviews(url, scrapeChan, wc) - dumpReviews(resultFile, decodeChan) - - 
return gc.save() + return &r }