1

Scraping correctly again

This commit is contained in:
Alex Yatskov 2015-08-16 19:30:45 +09:00
parent c8be7c69ea
commit 6138b978d4
2 changed files with 5 additions and 6 deletions

View File

@ -46,7 +46,7 @@ type profiler interface {
profile(doc *goquery.Document) *review profile(doc *goquery.Document) *review
} }
func makeAbsUrl(base, ref string) (string, error) { func makeAbsUrl(ref, base string) (string, error) {
b, err := url.Parse(base) b, err := url.Parse(base)
if err != nil { if err != nil {
return "", err return "", err
@ -85,13 +85,12 @@ func scrapeReview(url string, out chan review, cache *webCache, group *sync.Wait
if err != nil { if err != nil {
log.Printf("failed to scrape review at %s (%v)", url, err) log.Printf("failed to scrape review at %s (%v)", url, err)
} else if r := prof.profile(doc); r != nil { } else if r := prof.profile(doc); r != nil {
r.url = url
out <- *r out <- *r
} }
} }
func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) { func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profiler) {
var group sync.WaitGroup
doc, err := cache.load(indexUrl) doc, err := cache.load(indexUrl)
if err != nil { if err != nil {
log.Printf("failed to scrape index at %s (%v)", indexUrl, err) log.Printf("failed to scrape index at %s (%v)", indexUrl, err)
@ -103,6 +102,7 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profile
log.Fatal(err) log.Fatal(err)
} }
var group sync.WaitGroup
for _, reviewUrl := range reviewUrls { for _, reviewUrl := range reviewUrls {
absUrl, err := makeAbsUrl(reviewUrl, indexUrl) absUrl, err := makeAbsUrl(reviewUrl, indexUrl)
if err != nil { if err != nil {
@ -112,7 +112,6 @@ func scrapeIndex(indexUrl string, out chan review, cache *webCache, prof profile
group.Add(1) group.Add(1)
go scrapeReview(absUrl, out, cache, &group, prof) go scrapeReview(absUrl, out, cache, &group, prof)
} }
group.Wait() group.Wait()
if nextIndexUrl == "" { if nextIndexUrl == "" {

View File

@ -51,9 +51,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) {
func (tabelog) profile(doc *goquery.Document) *review { func (tabelog) profile(doc *goquery.Document) *review {
var r review var r review
r.url = doc.Url.String()
r.name = doc.Find("a.rd-header__rst-name-main").Text() r.name = doc.Find("a.rd-header__rst-name-main").Text()
if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 { if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
r.address = strings.TrimSpace(addresses.First().Text()) r.address = strings.TrimSpace(addresses.First().Text())
} else { } else {
@ -61,6 +59,8 @@ func (tabelog) profile(doc *goquery.Document) *review {
} }
var err error var err error
r.features = make(map[string]float64)
if r.features["dishes"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil { if r.features["dishes"], err = strconv.ParseFloat(doc.Find("#js-rating-detail > dd:nth-child(2)").Text(), 8); err != nil {
return nil return nil
} }