Scraping correctly again
This commit is contained in:
parent
c8be7c69ea
commit
6138b978d4
@ -46,7 +46,7 @@ type profiler interface {
|
|||||||
profile(doc *goquery.Document) *review
|
profile(doc *goquery.Document) *review
|
||||||
}
|
}
|
||||||
|
|
||||||
func makeAbsUrl(base, ref string) (string, error) {
|
func makeAbsUrl(ref, base string) (string, error) {
|
||||||
b, err := url.Parse(base)
|
b, err := url.Parse(base)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
@ -85,13 +85,12 @@ func scrapeReview(url string, out chan review, cache *webCache, group *sync.Wait
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to scrape review at %s (%v)", url, err)
|
log.Printf("failed to scrape review at %s (%v)", url, err)
|
||||||
} else if r := prof.profile(doc); r != nil {
|
} else if r := prof.profile(doc); r != nil {
|
||||||
|
r.url = url
|
||||||
out <- *r
|
out <- *r
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
|
func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
|
||||||
var group sync.WaitGroup
|
|
||||||
|
|
||||||
doc, err := cache.load(indexUrl)
|
doc, err := cache.load(indexUrl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to scrape index at %s (%v)", indexUrl, err)
|
log.Printf("failed to scrape index at %s (%v)", indexUrl, err)
|
||||||
@ -103,6 +102,7 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profile
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var group sync.WaitGroup
|
||||||
for _, reviewUrl := range reviewUrls {
|
for _, reviewUrl := range reviewUrls {
|
||||||
absUrl, err := makeAbsUrl(reviewUrl, indexUrl)
|
absUrl, err := makeAbsUrl(reviewUrl, indexUrl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -112,7 +112,6 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profile
|
|||||||
group.Add(1)
|
group.Add(1)
|
||||||
go scrapeReview(absUrl, out, cache, &group, prof)
|
go scrapeReview(absUrl, out, cache, &group, prof)
|
||||||
}
|
}
|
||||||
|
|
||||||
group.Wait()
|
group.Wait()
|
||||||
|
|
||||||
if nextIndexUrl == "" {
|
if nextIndexUrl == "" {
|
||||||
|
@ -51,9 +51,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) {
|
|||||||
func (tabelog) profile(doc *goquery.Document) *review {
|
func (tabelog) profile(doc *goquery.Document) *review {
|
||||||
var r review
|
var r review
|
||||||
|
|
||||||
r.url = doc.Url.String()
|
|
||||||
r.name = doc.Find("a.rd-header__rst-name-main").Text()
|
r.name = doc.Find("a.rd-header__rst-name-main").Text()
|
||||||
|
|
||||||
if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
|
if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
|
||||||
r.address = strings.TrimSpace(addresses.First().Text())
|
r.address = strings.TrimSpace(addresses.First().Text())
|
||||||
} else {
|
} else {
|
||||||
@ -61,6 +59,8 @@ func (tabelog) profile(doc *goquery.Document) *review {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
r.features = make(map[string]float64)
|
||||||
if r.features["dishes"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil {
|
if r.features["dishes"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user