WIP
This commit is contained in:
parent
bcfeae55b8
commit
a51d82d1ea
@ -30,38 +30,21 @@ import (
|
||||
"net/url"
|
||||
"os"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
|
||||
file, err := os.Open(urlsPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
// scrapeCtx groups the two caches a scrape works with: the geoCache that the
// decode method delegates to and the webCache that the load method delegates
// to. tabelog and tripadvisor embed it.
type scrapeCtx struct {
|
||||
gc *geoCache // target of decode calls
|
||||
wc *webCache // target of load calls
|
||||
}
|
||||
|
||||
var reviews []review
|
||||
var scanner = bufio.NewScanner(file)
|
||||
// decode passes address to the context's geoCache and returns that call's
// result unchanged: two float64 values and an error. decodeReviews assigns
// the two values returned by a decode call to a review's latitude and
// longitude, in that order.
func (s scrapeCtx) decode(address string) (float64, float64, error) {
|
||||
return s.gc.decode(address)
|
||||
}
|
||||
|
||||
for scanner.Scan() {
|
||||
if line := scanner.Text(); len(line) > 0 {
|
||||
parsed, err := url.Parse(line)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch parsed.Host {
|
||||
case "tabelog.com":
|
||||
reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
|
||||
case "www.tripadvisor.com":
|
||||
reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
|
||||
default:
|
||||
return nil, errors.New("unsupported review site")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return reviews, nil
|
||||
// load passes url to the context's webCache and returns that call's result
// unchanged: a goquery document and an error.
//
// The parameter name shadows the imported net/url package inside this
// method; the body does not use the package, so this is harmless here.
func (s scrapeCtx) load(url string) (*goquery.Document, error) {
|
||||
return s.wc.load(url)
|
||||
}
|
||||
|
||||
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
|
||||
@ -76,10 +59,34 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reviews, err := scrapeDataUrls(urlsPath, wc, gc)
|
||||
file, err := os.Open(urlsPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var (
|
||||
ctx = scrapeCtx{gc, wc}
|
||||
reviews []review
|
||||
)
|
||||
|
||||
for scanner := bufio.NewScanner(file); scanner.Scan(); {
|
||||
if line := scanner.Text(); len(line) > 0 {
|
||||
parsed, err := url.Parse(line)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch parsed.Host {
|
||||
case "tabelog.com":
|
||||
reviews = append(reviews, scrape(line, tabelog{scrapeCtx: ctx})...)
|
||||
case "www.tripadvisor.com":
|
||||
reviews = append(reviews, scrape(line, tripadvisor{scrapeCtx: ctx})...)
|
||||
default:
|
||||
return nil, errors.New("unsupported review site")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return reviews, nil
|
||||
}
|
||||
|
@ -52,13 +52,7 @@ type review struct {
|
||||
// scraper is the site-specific part of a scrape. Both methods receive an
// already-parsed goquery document; tabelog and tripadvisor each define an
// index method with this signature.
//
// NOTE(review): decodeReviews, scrapeReview and scrapeIndex also call decode
// and load on a scraper value, but those methods are not listed in the lines
// shown here — confirm the full method set against the complete file.
type scraper interface {
|
||||
// index takes an index page and returns a string and a string slice.
// NOTE(review): the meaning of the two results is not visible here — confirm.
index(doc *goquery.Document) (string, []string)
|
||||
// review takes a review page and returns two strings, a map of features
// keyed by string, and an error.
// NOTE(review): the meaning of the two strings is not visible here — confirm.
review(doc *goquery.Document) (string, string, map[string]feature, error)
|
||||
}
|
||||
|
||||
// decoder converts an address string into two float64 values and an error.
// The decodeReviews variant that accepts a decoder assigns the two values to
// a review's latitude and longitude, in that order.
type decoder interface {
|
||||
decode(address string) (float64, float64, error)
|
||||
}
|
||||
|
||||
// loader returns the goquery document for a URL, or an error. The
// scrapeReview and scrapeIndex variants that accept a loader log the error
// and return when load fails.
type loader interface {
|
||||
load(url string) (*goquery.Document, error)
|
||||
}
|
||||
|
||||
@ -76,11 +70,11 @@ func makeAbsUrl(ref, base string) (string, error) {
|
||||
return b.ResolveReference(r).String(), nil
|
||||
}
|
||||
|
||||
func decodeReviews(in chan review, out chan review, dec decoder) {
|
||||
func decodeReviews(in chan review, out chan review, scr scraper) {
|
||||
for {
|
||||
if res, ok := <-in; ok {
|
||||
var err error
|
||||
res.latitude, res.longitude, err = dec.decode(res.address)
|
||||
res.latitude, res.longitude, err = scr.decode(res.address)
|
||||
if err == nil {
|
||||
out <- res
|
||||
} else {
|
||||
@ -93,10 +87,10 @@ func decodeReviews(in chan review, out chan review, dec decoder) {
|
||||
}
|
||||
}
|
||||
|
||||
func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
|
||||
func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) {
|
||||
defer group.Done()
|
||||
|
||||
doc, err := lod.load(url)
|
||||
doc, err := scr.load(url)
|
||||
if err != nil {
|
||||
log.Printf("failed to load review at %s (%v)", url, err)
|
||||
return
|
||||
@ -115,8 +109,8 @@ func scrapeReview(url string, out chan review, lod loader, scr scraper, group *s
|
||||
url: url}
|
||||
}
|
||||
|
||||
func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
||||
doc, err := lod.load(indexUrl)
|
||||
func scrapeIndex(indexUrl string, out chan review, scr scraper) {
|
||||
doc, err := scr.load(indexUrl)
|
||||
if err != nil {
|
||||
log.Printf("failed to load index at %s (%v)", indexUrl, err)
|
||||
return
|
||||
@ -135,7 +129,7 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
||||
}
|
||||
|
||||
group.Add(1)
|
||||
go scrapeReview(absUrl, out, lod, scr, &group)
|
||||
go scrapeReview(absUrl, out, scr, &group)
|
||||
}
|
||||
group.Wait()
|
||||
|
||||
@ -147,16 +141,16 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
scrapeIndex(absUrl, out, lod, scr)
|
||||
scrapeIndex(absUrl, out, scr)
|
||||
}
|
||||
}
|
||||
|
||||
func scrape(url string, lod loader, dec decoder, scr scraper) []review {
|
||||
func scrape(url string, scr scraper) []review {
|
||||
out := make(chan review, 128)
|
||||
in := make(chan review, 128)
|
||||
|
||||
go scrapeIndex(url, in, lod, scr)
|
||||
go decodeReviews(in, out, dec)
|
||||
go scrapeIndex(url, in, scr)
|
||||
go decodeReviews(in, out, scr)
|
||||
|
||||
var results []review
|
||||
for {
|
||||
|
@ -32,6 +32,7 @@ import (
|
||||
)
|
||||
|
||||
// tabelog is the scraper value scrapeData builds for URLs whose host is
// "tabelog.com". It embeds scrapeCtx, so scrapeCtx's decode and load methods
// are promoted onto it.
type tabelog struct {
|
||||
scrapeCtx
|
||||
}
|
||||
|
||||
func (tabelog) index(doc *goquery.Document) (string, []string) {
|
||||
|
@ -32,6 +32,7 @@ import (
|
||||
)
|
||||
|
||||
// tripadvisor is the scraper value scrapeData builds for URLs whose host is
// "www.tripadvisor.com". It embeds scrapeCtx, so scrapeCtx's decode and load
// methods are promoted onto it.
type tripadvisor struct {
|
||||
scrapeCtx
|
||||
}
|
||||
|
||||
func (tripadvisor) index(doc *goquery.Document) (string, []string) {
|
||||
|
Loading…
Reference in New Issue
Block a user