1
This commit is contained in:
Alex Yatskov 2015-09-17 16:37:08 +09:00
parent bcfeae55b8
commit a51d82d1ea
4 changed files with 48 additions and 45 deletions

View File

@ -30,38 +30,21 @@ import (
"net/url"
"os"
"github.com/PuerkitoBio/goquery"
_ "github.com/mattn/go-sqlite3"
)
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
file, err := os.Open(urlsPath)
if err != nil {
return nil, err
}
defer file.Close()
// scrapeCtx bundles the geoCache and webCache that a scrape run works
// against. The site scrapers embed it to pick up its decode and load methods.
// NOTE: field order is significant; scrapeData constructs this value with the
// unkeyed literal scrapeCtx{gc, wc}.
type scrapeCtx struct {
// gc is the geoCache that decode delegates to.
gc *geoCache
// wc is the webCache that load delegates to.
wc *webCache
}
var reviews []review
var scanner = bufio.NewScanner(file)
// decode resolves address to a coordinate pair by delegating to the
// context's geoCache. The two float64 results are stored by decodeReviews as
// latitude and longitude, in that order.
func (s scrapeCtx) decode(address string) (float64, float64, error) {
	latitude, longitude, err := s.gc.decode(address)
	return latitude, longitude, err
}
for scanner.Scan() {
if line := scanner.Text(); len(line) > 0 {
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
switch parsed.Host {
case "tabelog.com":
reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
case "www.tripadvisor.com":
reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
default:
return nil, errors.New("unsupported review site")
}
}
}
return reviews, nil
// load returns the goquery document for url by delegating to the context's
// webCache; any error from the cache is passed through unchanged.
func (s scrapeCtx) load(url string) (*goquery.Document, error) {
	doc, err := s.wc.load(url)
	return doc, err
}
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
@ -76,10 +59,34 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
return nil, err
}
reviews, err := scrapeDataUrls(urlsPath, wc, gc)
file, err := os.Open(urlsPath)
if err != nil {
return nil, err
}
defer file.Close()
var (
ctx = scrapeCtx{gc, wc}
reviews []review
)
for scanner := bufio.NewScanner(file); scanner.Scan(); {
if line := scanner.Text(); len(line) > 0 {
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
switch parsed.Host {
case "tabelog.com":
reviews = append(reviews, scrape(line, tabelog{scrapeCtx: ctx})...)
case "www.tripadvisor.com":
reviews = append(reviews, scrape(line, tripadvisor{scrapeCtx: ctx})...)
default:
return nil, errors.New("unsupported review site")
}
}
}
return reviews, nil
}

View File

@ -52,13 +52,7 @@ type review struct {
// scraper holds the site-specific page parsing used by scrapeIndex and
// scrapeReview; tabelog and tripadvisor provide implementations.
type scraper interface {
// index inspects an index page document.
// NOTE(review): the string and []string results are presumably the next
// index page reference and the review references found on the page;
// confirm against scrapeIndex.
index(doc *goquery.Document) (string, []string)
// review inspects a single review page document and reports failure through
// the error result.
// NOTE(review): the meaning of the two strings and the feature map is not
// visible here; confirm against scrapeReview.
review(doc *goquery.Document) (string, string, map[string]feature, error)
}
// decoder resolves an address string into a coordinate pair.
type decoder interface {
// decode returns two float64 values for address, which decodeReviews stores
// as latitude and longitude in that order, or a non-nil error.
decode(address string) (float64, float64, error)
}
// loader obtains the parsed document behind a URL.
type loader interface {
// load returns the goquery document for url, or a non-nil error if it could
// not be obtained.
load(url string) (*goquery.Document, error)
}
@ -76,11 +70,11 @@ func makeAbsUrl(ref, base string) (string, error) {
return b.ResolveReference(r).String(), nil
}
func decodeReviews(in chan review, out chan review, dec decoder) {
func decodeReviews(in chan review, out chan review, scr scraper) {
for {
if res, ok := <-in; ok {
var err error
res.latitude, res.longitude, err = dec.decode(res.address)
res.latitude, res.longitude, err = scr.decode(res.address)
if err == nil {
out <- res
} else {
@ -93,10 +87,10 @@ func decodeReviews(in chan review, out chan review, dec decoder) {
}
}
func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) {
defer group.Done()
doc, err := lod.load(url)
doc, err := scr.load(url)
if err != nil {
log.Printf("failed to load review at %s (%v)", url, err)
return
@ -115,8 +109,8 @@ func scrapeReview(url string, out chan review, lod loader, scr scraper, group *s
url: url}
}
func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
doc, err := lod.load(indexUrl)
func scrapeIndex(indexUrl string, out chan review, scr scraper) {
doc, err := scr.load(indexUrl)
if err != nil {
log.Printf("failed to load index at %s (%v)", indexUrl, err)
return
@ -135,7 +129,7 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
}
group.Add(1)
go scrapeReview(absUrl, out, lod, scr, &group)
go scrapeReview(absUrl, out, scr, &group)
}
group.Wait()
@ -147,16 +141,16 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
log.Fatal(err)
}
scrapeIndex(absUrl, out, lod, scr)
scrapeIndex(absUrl, out, scr)
}
}
func scrape(url string, lod loader, dec decoder, scr scraper) []review {
func scrape(url string, scr scraper) []review {
out := make(chan review, 128)
in := make(chan review, 128)
go scrapeIndex(url, in, lod, scr)
go decodeReviews(in, out, dec)
go scrapeIndex(url, in, scr)
go decodeReviews(in, out, scr)
var results []review
for {

View File

@ -32,6 +32,7 @@ import (
)
// tabelog is the scraper that scrapeData selects for URLs whose host is
// "tabelog.com".
type tabelog struct {
// Embedded so that tabelog gains scrapeCtx's decode and load methods.
scrapeCtx
}
func (tabelog) index(doc *goquery.Document) (string, []string) {

View File

@ -32,6 +32,7 @@ import (
)
// tripadvisor is the scraper that scrapeData selects for URLs whose host is
// "www.tripadvisor.com".
type tripadvisor struct {
// Embedded so that tripadvisor gains scrapeCtx's decode and load methods.
scrapeCtx
}
func (tripadvisor) index(doc *goquery.Document) (string, []string) {