1
This commit is contained in:
Alex Yatskov 2015-09-17 16:37:08 +09:00
parent bcfeae55b8
commit a51d82d1ea
4 changed files with 48 additions and 45 deletions

View File

@ -30,38 +30,21 @@ import (
"net/url" "net/url"
"os" "os"
"github.com/PuerkitoBio/goquery"
_ "github.com/mattn/go-sqlite3" _ "github.com/mattn/go-sqlite3"
) )
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) { type scrapeCtx struct {
file, err := os.Open(urlsPath) gc *geoCache
if err != nil { wc *webCache
return nil, err }
}
defer file.Close()
var reviews []review func (s scrapeCtx) decode(address string) (float64, float64, error) {
var scanner = bufio.NewScanner(file) return s.gc.decode(address)
}
for scanner.Scan() { func (s scrapeCtx) load(url string) (*goquery.Document, error) {
if line := scanner.Text(); len(line) > 0 { return s.wc.load(url)
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
switch parsed.Host {
case "tabelog.com":
reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
case "www.tripadvisor.com":
reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
default:
return nil, errors.New("unsupported review site")
}
}
}
return reviews, nil
} }
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) { func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
@ -76,10 +59,34 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
return nil, err return nil, err
} }
reviews, err := scrapeDataUrls(urlsPath, wc, gc) file, err := os.Open(urlsPath)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer file.Close()
var (
ctx = scrapeCtx{gc, wc}
reviews []review
)
for scanner := bufio.NewScanner(file); scanner.Scan(); {
if line := scanner.Text(); len(line) > 0 {
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
switch parsed.Host {
case "tabelog.com":
reviews = append(reviews, scrape(line, tabelog{scrapeCtx: ctx})...)
case "www.tripadvisor.com":
reviews = append(reviews, scrape(line, tripadvisor{scrapeCtx: ctx})...)
default:
return nil, errors.New("unsupported review site")
}
}
}
return reviews, nil return reviews, nil
} }

View File

@ -52,13 +52,7 @@ type review struct {
type scraper interface { type scraper interface {
index(doc *goquery.Document) (string, []string) index(doc *goquery.Document) (string, []string)
review(doc *goquery.Document) (string, string, map[string]feature, error) review(doc *goquery.Document) (string, string, map[string]feature, error)
}
type decoder interface {
decode(address string) (float64, float64, error) decode(address string) (float64, float64, error)
}
type loader interface {
load(url string) (*goquery.Document, error) load(url string) (*goquery.Document, error)
} }
@ -76,11 +70,11 @@ func makeAbsUrl(ref, base string) (string, error) {
return b.ResolveReference(r).String(), nil return b.ResolveReference(r).String(), nil
} }
func decodeReviews(in chan review, out chan review, dec decoder) { func decodeReviews(in chan review, out chan review, scr scraper) {
for { for {
if res, ok := <-in; ok { if res, ok := <-in; ok {
var err error var err error
res.latitude, res.longitude, err = dec.decode(res.address) res.latitude, res.longitude, err = scr.decode(res.address)
if err == nil { if err == nil {
out <- res out <- res
} else { } else {
@ -93,10 +87,10 @@ func decodeReviews(in chan review, out chan review, dec decoder) {
} }
} }
func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) { func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) {
defer group.Done() defer group.Done()
doc, err := lod.load(url) doc, err := scr.load(url)
if err != nil { if err != nil {
log.Printf("failed to load review at %s (%v)", url, err) log.Printf("failed to load review at %s (%v)", url, err)
return return
@ -115,8 +109,8 @@ func scrapeReview(url string, out chan review, lod loader, scr scraper, group *s
url: url} url: url}
} }
func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) { func scrapeIndex(indexUrl string, out chan review, scr scraper) {
doc, err := lod.load(indexUrl) doc, err := scr.load(indexUrl)
if err != nil { if err != nil {
log.Printf("failed to load index at %s (%v)", indexUrl, err) log.Printf("failed to load index at %s (%v)", indexUrl, err)
return return
@ -135,7 +129,7 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
} }
group.Add(1) group.Add(1)
go scrapeReview(absUrl, out, lod, scr, &group) go scrapeReview(absUrl, out, scr, &group)
} }
group.Wait() group.Wait()
@ -147,16 +141,16 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
log.Fatal(err) log.Fatal(err)
} }
scrapeIndex(absUrl, out, lod, scr) scrapeIndex(absUrl, out, scr)
} }
} }
func scrape(url string, lod loader, dec decoder, scr scraper) []review { func scrape(url string, scr scraper) []review {
out := make(chan review, 128) out := make(chan review, 128)
in := make(chan review, 128) in := make(chan review, 128)
go scrapeIndex(url, in, lod, scr) go scrapeIndex(url, in, scr)
go decodeReviews(in, out, dec) go decodeReviews(in, out, scr)
var results []review var results []review
for { for {

View File

@ -32,6 +32,7 @@ import (
) )
type tabelog struct { type tabelog struct {
scrapeCtx
} }
func (tabelog) index(doc *goquery.Document) (string, []string) { func (tabelog) index(doc *goquery.Document) (string, []string) {

View File

@ -32,6 +32,7 @@ import (
) )
type tripadvisor struct { type tripadvisor struct {
scrapeCtx
} }
func (tripadvisor) index(doc *goquery.Document) (string, []string) { func (tripadvisor) index(doc *goquery.Document) (string, []string) {