From a51d82d1ea929a752ec59073115365329d0720eb Mon Sep 17 00:00:00 2001
From: Alex Yatskov
Date: Thu, 17 Sep 2015 16:37:08 +0900
Subject: [PATCH] WIP

---
Review notes (free-form area, ignored by git am):
  * build.go: scrapeDataUrls is inlined into scrapeData, and a new scrapeCtx
    struct bundles the geoCache and webCache, forwarding decode/load to them.
  * scrape.go: the decoder and loader interfaces are folded into scraper, so
    every helper takes a single scraper argument.
  * tabelog.go / tripadvisor.go: both types embed scrapeCtx to pick up the
    load and decode methods.
  * NOTE(review): the scan loop in scrapeData never checks scanner.Err(), so a
    read error on the URL list is indistinguishable from end of file (the
    removed scrapeDataUrls had the same gap).
  * NOTE(review): line breaks and leading indentation were reconstructed from
    the hunk headers; spacing inside code lines is kept as received, so
    gofmt column alignment may differ from the tree this was cut from.

 build/build.go       | 63 ++++++++++++++++++++++++--------------------
 build/scrape.go      | 28 ++++++++------------
 build/tabelog.go     |  1 +
 build/tripadvisor.go |  1 +
 4 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/build/build.go b/build/build.go
index a61a84b..f532e8f 100644
--- a/build/build.go
+++ b/build/build.go
@@ -30,38 +30,21 @@ import (
 	"net/url"
 	"os"
 
+	"github.com/PuerkitoBio/goquery"
 	_ "github.com/mattn/go-sqlite3"
 )
 
-func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
-	file, err := os.Open(urlsPath)
-	if err != nil {
-		return nil, err
-	}
-	defer file.Close()
+type scrapeCtx struct {
+	gc *geoCache
+	wc *webCache
+}
 
-	var reviews []review
-	var scanner = bufio.NewScanner(file)
+func (s scrapeCtx) decode(address string) (float64, float64, error) {
+	return s.gc.decode(address)
+}
 
-	for scanner.Scan() {
-		if line := scanner.Text(); len(line) > 0 {
-			parsed, err := url.Parse(line)
-			if err != nil {
-				return nil, err
-			}
-
-			switch parsed.Host {
-			case "tabelog.com":
-				reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
-			case "www.tripadvisor.com":
-				reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
-			default:
-				return nil, errors.New("unsupported review site")
-			}
-		}
-	}
-
-	return reviews, nil
+func (s scrapeCtx) load(url string) (*goquery.Document, error) {
+	return s.wc.load(url)
 }
 
 func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
@@ -76,10 +59,34 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
 		return nil, err
 	}
 
-	reviews, err := scrapeDataUrls(urlsPath, wc, gc)
+	file, err := os.Open(urlsPath)
 	if err != nil {
 		return nil, err
 	}
+	defer file.Close()
+
+	var (
+		ctx = scrapeCtx{gc, wc}
+		reviews []review
+	)
+
+	for scanner := bufio.NewScanner(file); scanner.Scan(); {
+		if line := scanner.Text(); len(line) > 0 {
+			parsed, err := url.Parse(line)
+			if err != nil {
+				return nil, err
+			}
+
+			switch parsed.Host {
+			case "tabelog.com":
+				reviews = append(reviews, scrape(line, tabelog{scrapeCtx: ctx})...)
+			case "www.tripadvisor.com":
+				reviews = append(reviews, scrape(line, tripadvisor{scrapeCtx: ctx})...)
+			default:
+				return nil, errors.New("unsupported review site")
+			}
+		}
+	}
 
 	return reviews, nil
 }
diff --git a/build/scrape.go b/build/scrape.go
index 1476063..3bed28c 100644
--- a/build/scrape.go
+++ b/build/scrape.go
@@ -52,13 +52,7 @@ type review struct {
 type scraper interface {
 	index(doc *goquery.Document) (string, []string)
 	review(doc *goquery.Document) (string, string, map[string]feature, error)
-}
-
-type decoder interface {
 	decode(address string) (float64, float64, error)
-}
-
-type loader interface {
 	load(url string) (*goquery.Document, error)
 }
 
@@ -76,11 +70,11 @@ func makeAbsUrl(ref, base string) (string, error) {
 	return b.ResolveReference(r).String(), nil
 }
 
-func decodeReviews(in chan review, out chan review, dec decoder) {
+func decodeReviews(in chan review, out chan review, scr scraper) {
 	for {
 		if res, ok := <-in; ok {
 			var err error
-			res.latitude, res.longitude, err = dec.decode(res.address)
+			res.latitude, res.longitude, err = scr.decode(res.address)
 			if err == nil {
 				out <- res
 			} else {
@@ -93,10 +87,10 @@ func decodeReviews(in chan review, out chan review, dec decoder) {
 	}
 }
 
-func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
+func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) {
 	defer group.Done()
 
-	doc, err := lod.load(url)
+	doc, err := scr.load(url)
 	if err != nil {
 		log.Printf("failed to load review at %s (%v)", url, err)
 		return
@@ -115,8 +109,8 @@ func scrapeReview(url string, out chan review, lod loader, scr scraper, group *s
 		url: url}
 }
 
-func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
-	doc, err := lod.load(indexUrl)
+func scrapeIndex(indexUrl string, out chan review, scr scraper) {
+	doc, err := scr.load(indexUrl)
 	if err != nil {
 		log.Printf("failed to load index at %s (%v)", indexUrl, err)
 		return
@@ -135,7 +129,7 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
 		}
 
 		group.Add(1)
-		go scrapeReview(absUrl, out, lod, scr, &group)
+		go scrapeReview(absUrl, out, scr, &group)
 	}
 
 	group.Wait()
@@ -147,16 +141,16 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
 			log.Fatal(err)
 		}
 
-		scrapeIndex(absUrl, out, lod, scr)
+		scrapeIndex(absUrl, out, scr)
 	}
 }
 
-func scrape(url string, lod loader, dec decoder, scr scraper) []review {
+func scrape(url string, scr scraper) []review {
 	out := make(chan review, 128)
 	in := make(chan review, 128)
 
-	go scrapeIndex(url, in, lod, scr)
-	go decodeReviews(in, out, dec)
+	go scrapeIndex(url, in, scr)
+	go decodeReviews(in, out, scr)
 
 	var results []review
 	for {
diff --git a/build/tabelog.go b/build/tabelog.go
index 0598f78..690b9b1 100644
--- a/build/tabelog.go
+++ b/build/tabelog.go
@@ -32,6 +32,7 @@ import (
 )
 
 type tabelog struct {
+	scrapeCtx
 }
 
 func (tabelog) index(doc *goquery.Document) (string, []string) {
diff --git a/build/tripadvisor.go b/build/tripadvisor.go
index 5931af9..8194049 100644
--- a/build/tripadvisor.go
+++ b/build/tripadvisor.go
@@ -32,6 +32,7 @@ import (
 )
 
 type tripadvisor struct {
+	scrapeCtx
 }
 
 func (tripadvisor) index(doc *goquery.Document) (string, []string) {