
Throttling improvements

Alex Yatskov 2015-08-14 17:27:09 +09:00
parent adf3e20004
commit 439196a87d
4 changed files with 5640 additions and 5598 deletions

cache/geocache.json (vendored, 32 changed lines)

@@ -271,6 +271,10 @@
         "Latitude": 35.4088483,
         "Longitude": 139.5953328
     },
+    "1-15-8 Mizonokuchi Takatsu-ku Kawasaki Kanagawa": {
+        "Latitude": 35.6006815,
+        "Longitude": 139.6137396
+    },
     "1-16-10 Fujigaoka Aoba-ku Yokohama Kanagawa": {
         "Latitude": 35.5453749,
         "Longitude": 139.5274005
@@ -455,6 +459,10 @@
         "Latitude": 35.5081315,
         "Longitude": 139.6788956
     },
+    "1-3-1 Minamisaiwai Nishi-ku Yokohama Kanagawa": {
+        "Latitude": 35.4673383,
+        "Longitude": 139.6220148
+    },
     "1-3-12 Kugenumaishigami Fujisawa Kanagawa": {
         "Latitude": 35.3366651,
         "Longitude": 139.488675
@@ -863,6 +871,10 @@
         "Latitude": 35.3690655,
         "Longitude": 139.5224912
     },
+    "137 Yamashitacho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4425663,
+        "Longitude": 139.6461168
+    },
     "138 Yamashitacho Naka-ku Yokohama Kanagawa": {
         "Latitude": 35.4425774,
         "Longitude": 139.6458973
@@ -991,6 +1003,10 @@
         "Latitude": 35.4433217,
         "Longitude": 139.6463001
     },
+    "191-4 Yamashitacho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4433217,
+        "Longitude": 139.6463001
+    },
     "192-15 Yamashitacho Naka-ku Yokohama Kanagawa": {
         "Latitude": 35.4433273,
         "Longitude": 139.6467417
@@ -1911,6 +1927,10 @@
         "Latitude": 35.37976829999999,
         "Longitude": 139.5787929
     },
+    "3-27-3 Tokiwacho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.445752,
+        "Longitude": 139.6370117
+    },
     "3-28-13 Shimonagaya Konan-ku Yokohama Kanagawa": {
         "Latitude": 35.4078842,
         "Longitude": 139.5642913
@@ -2019,6 +2039,10 @@
         "Latitude": 35.4729078,
         "Longitude": 139.4637948
     },
+    "3-75 Suehirocho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4437467,
+        "Longitude": 139.6323621
+    },
     "3-77 Miyagawacho Naka-ku Yokohama Kanagawa": {
         "Latitude": 35.446688,
         "Longitude": 139.6276539
@@ -2103,6 +2127,10 @@
         "Latitude": 35.6108298,
         "Longitude": 139.5921748
     },
+    "4-122-3 Isezakicho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4414997,
+        "Longitude": 139.628118
+    },
     "4-14-25 Chuorinkan Yamato Kanagawa": {
         "Latitude": 35.5080078,
         "Longitude": 139.4455886
@@ -2671,6 +2699,10 @@
         "Latitude": 35.4651997,
         "Longitude": 139.6191458
     },
+    "神奈川県横浜市西区南幸2-15-1": {
+        "Latitude": 35.4647525,
+        "Longitude": 139.6187736
+    },
     "神奈川県横浜市都筑区茅ヶ崎中央30-17": {
         "Latitude": 35.54467,
         "Longitude": 139.5694442

File diff suppressed because it is too large.


@@ -67,9 +67,7 @@ func makeAbsUrl(base, ref string) string {
 	return b.ResolveReference(r).String()
 }
 
-func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
-	defer wg.Done()
-
+func dumpReviews(filename string, in chan tabelogReview) {
 	var reviews []tabelogReview
 	for {
 		if review, ok := <-in; ok {
@@ -89,12 +87,10 @@ func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
 	}
 }
 
-func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitGroup, gc *geoCache) {
-	defer wg.Done()
-
+func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) {
 	for {
 		if review, ok := <-in; ok {
-			log.Printf("decoding %s", review.Name)
+			log.Printf("decoding address for %s", review.Name)
 			coord, err := gc.decode(review.Address)
 			if err != nil {
@@ -112,7 +108,8 @@ func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitG
 	}
 }
 
-func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
+func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
+	log.Printf("scraping review: %s", url)
 	defer wg.Done()
 
 	doc, err := wc.load(url)
@@ -150,52 +147,56 @@ func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *we
 	out <- review
 }
 
-func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
+func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
+	log.Printf("scraping index: %s", url)
+	defer wg.Done()
+
 	doc, err := wc.load(url)
 	if err != nil {
 		log.Fatal(err)
 	}
 
-	var wg sync.WaitGroup
 	doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
 		if href, ok := sel.Attr("href"); ok {
 			wg.Add(1)
-			go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
+			go scrapeReview(makeAbsUrl(url, href), out, wc, wg)
 		}
 	})
-	wg.Wait()
 
 	if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
-		scrapeIndex(makeAbsUrl(url, href), out, wc)
+		wg.Add(1)
+		scrapeIndex(makeAbsUrl(url, href), out, wc, wg)
 	}
 }
 
-func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
+func scrapeReviews(url, webCacheDir string, out chan tabelogReview) {
 	wc, err := newWebCache(webCacheDir)
 	if err != nil {
 		log.Fatal(err)
 	}
 
+	var wg sync.WaitGroup
+	wg.Add(1)
+	scrapeIndex(url, out, wc, &wg)
+	wg.Wait()
+
+	close(out)
+}
+
+func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
 	gc, err := newGeoCache(geoCacheFile)
 	if err != nil {
 		log.Fatal(err)
 	}
 
-	scrapeChan := make(chan tabelogReview)
-	decodeChan := make(chan tabelogReview)
+	scrapeChan := make(chan tabelogReview, 2000)
+	decodeChan := make(chan tabelogReview, 2000)
 
-	var wg sync.WaitGroup
-	wg.Add(2)
-
-	go decodeReviews(scrapeChan, decodeChan, &wg, gc)
-	go dumpReviews(resultFile, decodeChan, &wg)
-
-	scrapeIndex(url, scrapeChan, wc)
-	close(scrapeChan)
+	go decodeReviews(scrapeChan, decodeChan, gc)
+	scrapeReviews(url, webCacheDir, scrapeChan)
+	dumpReviews(resultFile, decodeChan)
 
 	if err := gc.save(); err != nil {
 		log.Fatal(err)
 	}
-
-	wg.Wait()
 }
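After this change, WaitGroup ownership is centralized: scrapeReviews creates the group, scrapeIndex and scrapeReview receive it as a parameter, and the scrape channel is closed only after a single Wait, which is why the old per-page wg.Wait() inside scrapeIndex could be dropped. The snippet below is a minimal stand-alone sketch of that fan-out pattern; process and fanOut are illustrative stand-ins for scrapeReview and scrapeIndex, not names from the repository.

package main

import (
	"fmt"
	"sync"
)

// process stands in for scrapeReview: it signals completion via the
// caller-owned WaitGroup and sends its result downstream.
func process(item int, out chan int, wg *sync.WaitGroup) {
	defer wg.Done()
	out <- item * item
}

// fanOut stands in for scrapeIndex: it spawns one goroutine per item,
// registering each with the caller-owned WaitGroup before launching it.
func fanOut(items []int, out chan int, wg *sync.WaitGroup) {
	defer wg.Done()
	for _, item := range items {
		wg.Add(1)
		go process(item, out, wg)
	}
}

func main() {
	// Buffered like scrapeChan/decodeChan, so producers don't block
	// while the single point of synchronization below is still pending.
	results := make(chan int, 16)

	var wg sync.WaitGroup
	wg.Add(1)
	fanOut([]int{1, 2, 3, 4}, results, &wg)
	wg.Wait()
	close(results) // safe: every producer has finished

	for r := range results {
		fmt.Println(r)
	}
}

Closing the channel only after wg.Wait() returns is what lets the downstream receive loop terminate cleanly instead of racing against late producers.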


@@ -30,12 +30,14 @@ import (
 	"net/http"
 	"os"
 	"path"
+	"time"
 
 	"github.com/PuerkitoBio/goquery"
 )
 
 type webCache struct {
 	directory string
+	ticker    *time.Ticker
 }
 
 func newWebCache(directory string) (*webCache, error) {
@@ -43,7 +45,12 @@ func newWebCache(directory string) (*webCache, error) {
 		return nil, err
 	}
 
-	return &webCache{directory: directory}, nil
+	cache := &webCache{
+		directory: directory,
+		ticker:    time.NewTicker(time.Millisecond * 100),
+	}
+
+	return cache, nil
 }
 
 func (c *webCache) urlToLocal(url string) string {
@@ -60,6 +67,8 @@ func (c *webCache) load(url string) (*goquery.Document, error) {
 		return goquery.NewDocumentFromReader(file)
 	}
 
+	<-c.ticker.C
+
 	res, err := http.Get(url)
 	if err != nil {
 		return nil, err
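The throttling itself is this last change: cache hits are served from disk immediately, while every cache miss must first receive from the shared ticker, spacing network fetches at least 100 ms apart no matter how many scraper goroutines are running. Below is a minimal stand-alone sketch of the same time.Ticker pattern; throttledClient and newThrottledClient are illustrative names, not part of the repository.

package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

// throttledClient serializes outbound requests through a shared ticker,
// so concurrent callers collectively stay under one request per interval.
type throttledClient struct {
	ticker *time.Ticker
}

func newThrottledClient(interval time.Duration) *throttledClient {
	return &throttledClient{ticker: time.NewTicker(interval)}
}

func (c *throttledClient) get(url string) (*http.Response, error) {
	<-c.ticker.C // block until the next tick; this is the throttle
	return http.Get(url)
}

func main() {
	client := newThrottledClient(100 * time.Millisecond)

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// example.com is a placeholder target.
			res, err := client.get("http://example.com/")
			if err != nil {
				fmt.Println(err)
				return
			}
			res.Body.Close()
			fmt.Println(res.Status)
		}()
	}
	wg.Wait()
}

A ticker's channel holds at most one pending tick, so each tick releases exactly one blocked caller; that property is what lets a single shared ticker act as a global rate limit across goroutines.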