WIP
This commit is contained in:
parent
bcfeae55b8
commit
a51d82d1ea
@ -30,38 +30,21 @@ import (
|
|||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
_ "github.com/mattn/go-sqlite3"
|
_ "github.com/mattn/go-sqlite3"
|
||||||
)
|
)
|
||||||
|
|
||||||
func scrapeDataUrls(urlsPath string, wc *webCache, gc *geoCache) ([]review, error) {
|
type scrapeCtx struct {
|
||||||
file, err := os.Open(urlsPath)
|
gc *geoCache
|
||||||
if err != nil {
|
wc *webCache
|
||||||
return nil, err
|
}
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
var reviews []review
|
func (s scrapeCtx) decode(address string) (float64, float64, error) {
|
||||||
var scanner = bufio.NewScanner(file)
|
return s.gc.decode(address)
|
||||||
|
}
|
||||||
|
|
||||||
for scanner.Scan() {
|
func (s scrapeCtx) load(url string) (*goquery.Document, error) {
|
||||||
if line := scanner.Text(); len(line) > 0 {
|
return s.wc.load(url)
|
||||||
parsed, err := url.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
switch parsed.Host {
|
|
||||||
case "tabelog.com":
|
|
||||||
reviews = append(reviews, scrape(line, wc, gc, tabelog{})...)
|
|
||||||
case "www.tripadvisor.com":
|
|
||||||
reviews = append(reviews, scrape(line, wc, gc, tripadvisor{})...)
|
|
||||||
default:
|
|
||||||
return nil, errors.New("unsupported review site")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return reviews, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
|
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
|
||||||
@ -76,10 +59,34 @@ func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
reviews, err := scrapeDataUrls(urlsPath, wc, gc)
|
file, err := os.Open(urlsPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
var (
|
||||||
|
ctx = scrapeCtx{gc, wc}
|
||||||
|
reviews []review
|
||||||
|
)
|
||||||
|
|
||||||
|
for scanner := bufio.NewScanner(file); scanner.Scan(); {
|
||||||
|
if line := scanner.Text(); len(line) > 0 {
|
||||||
|
parsed, err := url.Parse(line)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
switch parsed.Host {
|
||||||
|
case "tabelog.com":
|
||||||
|
reviews = append(reviews, scrape(line, tabelog{scrapeCtx: ctx})...)
|
||||||
|
case "www.tripadvisor.com":
|
||||||
|
reviews = append(reviews, scrape(line, tripadvisor{scrapeCtx: ctx})...)
|
||||||
|
default:
|
||||||
|
return nil, errors.New("unsupported review site")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return reviews, nil
|
return reviews, nil
|
||||||
}
|
}
|
||||||
|
@ -52,13 +52,7 @@ type review struct {
|
|||||||
type scraper interface {
|
type scraper interface {
|
||||||
index(doc *goquery.Document) (string, []string)
|
index(doc *goquery.Document) (string, []string)
|
||||||
review(doc *goquery.Document) (string, string, map[string]feature, error)
|
review(doc *goquery.Document) (string, string, map[string]feature, error)
|
||||||
}
|
|
||||||
|
|
||||||
type decoder interface {
|
|
||||||
decode(address string) (float64, float64, error)
|
decode(address string) (float64, float64, error)
|
||||||
}
|
|
||||||
|
|
||||||
type loader interface {
|
|
||||||
load(url string) (*goquery.Document, error)
|
load(url string) (*goquery.Document, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -76,11 +70,11 @@ func makeAbsUrl(ref, base string) (string, error) {
|
|||||||
return b.ResolveReference(r).String(), nil
|
return b.ResolveReference(r).String(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func decodeReviews(in chan review, out chan review, dec decoder) {
|
func decodeReviews(in chan review, out chan review, scr scraper) {
|
||||||
for {
|
for {
|
||||||
if res, ok := <-in; ok {
|
if res, ok := <-in; ok {
|
||||||
var err error
|
var err error
|
||||||
res.latitude, res.longitude, err = dec.decode(res.address)
|
res.latitude, res.longitude, err = scr.decode(res.address)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
out <- res
|
out <- res
|
||||||
} else {
|
} else {
|
||||||
@ -93,10 +87,10 @@ func decodeReviews(in chan review, out chan review, dec decoder) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeReview(url string, out chan review, lod loader, scr scraper, group *sync.WaitGroup) {
|
func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) {
|
||||||
defer group.Done()
|
defer group.Done()
|
||||||
|
|
||||||
doc, err := lod.load(url)
|
doc, err := scr.load(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to load review at %s (%v)", url, err)
|
log.Printf("failed to load review at %s (%v)", url, err)
|
||||||
return
|
return
|
||||||
@ -115,8 +109,8 @@ func scrapeReview(url string, out chan review, lod loader, scr scraper, group *s
|
|||||||
url: url}
|
url: url}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
func scrapeIndex(indexUrl string, out chan review, scr scraper) {
|
||||||
doc, err := lod.load(indexUrl)
|
doc, err := scr.load(indexUrl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to load index at %s (%v)", indexUrl, err)
|
log.Printf("failed to load index at %s (%v)", indexUrl, err)
|
||||||
return
|
return
|
||||||
@ -135,7 +129,7 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
group.Add(1)
|
group.Add(1)
|
||||||
go scrapeReview(absUrl, out, lod, scr, &group)
|
go scrapeReview(absUrl, out, scr, &group)
|
||||||
}
|
}
|
||||||
group.Wait()
|
group.Wait()
|
||||||
|
|
||||||
@ -147,16 +141,16 @@ func scrapeIndex(indexUrl string, out chan review, lod loader, scr scraper) {
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
scrapeIndex(absUrl, out, lod, scr)
|
scrapeIndex(absUrl, out, scr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrape(url string, lod loader, dec decoder, scr scraper) []review {
|
func scrape(url string, scr scraper) []review {
|
||||||
out := make(chan review, 128)
|
out := make(chan review, 128)
|
||||||
in := make(chan review, 128)
|
in := make(chan review, 128)
|
||||||
|
|
||||||
go scrapeIndex(url, in, lod, scr)
|
go scrapeIndex(url, in, scr)
|
||||||
go decodeReviews(in, out, dec)
|
go decodeReviews(in, out, scr)
|
||||||
|
|
||||||
var results []review
|
var results []review
|
||||||
for {
|
for {
|
||||||
|
@ -32,6 +32,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type tabelog struct {
|
type tabelog struct {
|
||||||
|
scrapeCtx
|
||||||
}
|
}
|
||||||
|
|
||||||
func (tabelog) index(doc *goquery.Document) (string, []string) {
|
func (tabelog) index(doc *goquery.Document) (string, []string) {
|
||||||
|
@ -32,6 +32,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type tripadvisor struct {
|
type tripadvisor struct {
|
||||||
|
scrapeCtx
|
||||||
}
|
}
|
||||||
|
|
||||||
func (tripadvisor) index(doc *goquery.Document) (string, []string) {
|
func (tripadvisor) index(doc *goquery.Document) (string, []string) {
|
||||||
|
Loading…
Reference in New Issue
Block a user