Throttling improvements
This commit is contained in:
parent
adf3e20004
commit
439196a87d
32
cache/geocache.json
vendored
32
cache/geocache.json
vendored
@ -271,6 +271,10 @@
|
||||
"Latitude": 35.4088483,
|
||||
"Longitude": 139.5953328
|
||||
},
|
||||
"1-15-8 Mizonokuchi Takatsu-ku Kawasaki Kanagawa": {
|
||||
"Latitude": 35.6006815,
|
||||
"Longitude": 139.6137396
|
||||
},
|
||||
"1-16-10 Fujigaoka Aoba-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.5453749,
|
||||
"Longitude": 139.5274005
|
||||
@ -455,6 +459,10 @@
|
||||
"Latitude": 35.5081315,
|
||||
"Longitude": 139.6788956
|
||||
},
|
||||
"1-3-1 Minamisaiwai Nishi-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4673383,
|
||||
"Longitude": 139.6220148
|
||||
},
|
||||
"1-3-12 Kugenumaishigami Fujisawa Kanagawa": {
|
||||
"Latitude": 35.3366651,
|
||||
"Longitude": 139.488675
|
||||
@ -863,6 +871,10 @@
|
||||
"Latitude": 35.3690655,
|
||||
"Longitude": 139.5224912
|
||||
},
|
||||
"137 Yamashitacho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4425663,
|
||||
"Longitude": 139.6461168
|
||||
},
|
||||
"138 Yamashitacho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4425774,
|
||||
"Longitude": 139.6458973
|
||||
@ -991,6 +1003,10 @@
|
||||
"Latitude": 35.4433217,
|
||||
"Longitude": 139.6463001
|
||||
},
|
||||
"191-4 Yamashitacho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4433217,
|
||||
"Longitude": 139.6463001
|
||||
},
|
||||
"192-15 Yamashitacho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4433273,
|
||||
"Longitude": 139.6467417
|
||||
@ -1911,6 +1927,10 @@
|
||||
"Latitude": 35.37976829999999,
|
||||
"Longitude": 139.5787929
|
||||
},
|
||||
"3-27-3 Tokiwacho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.445752,
|
||||
"Longitude": 139.6370117
|
||||
},
|
||||
"3-28-13 Shimonagaya Konan-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4078842,
|
||||
"Longitude": 139.5642913
|
||||
@ -2019,6 +2039,10 @@
|
||||
"Latitude": 35.4729078,
|
||||
"Longitude": 139.4637948
|
||||
},
|
||||
"3-75 Suehirocho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4437467,
|
||||
"Longitude": 139.6323621
|
||||
},
|
||||
"3-77 Miyagawacho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.446688,
|
||||
"Longitude": 139.6276539
|
||||
@ -2103,6 +2127,10 @@
|
||||
"Latitude": 35.6108298,
|
||||
"Longitude": 139.5921748
|
||||
},
|
||||
"4-122-3 Isezakicho Naka-ku Yokohama Kanagawa": {
|
||||
"Latitude": 35.4414997,
|
||||
"Longitude": 139.628118
|
||||
},
|
||||
"4-14-25 Chuorinkan Yamato Kanagawa": {
|
||||
"Latitude": 35.5080078,
|
||||
"Longitude": 139.4455886
|
||||
@ -2671,6 +2699,10 @@
|
||||
"Latitude": 35.4651997,
|
||||
"Longitude": 139.6191458
|
||||
},
|
||||
"神奈川県横浜市西区南幸2-15-1": {
|
||||
"Latitude": 35.4647525,
|
||||
"Longitude": 139.6187736
|
||||
},
|
||||
"神奈川県横浜市都筑区茅ヶ崎中央30-17": {
|
||||
"Latitude": 35.54467,
|
||||
"Longitude": 139.5694442
|
||||
|
11142
data/tabelog.json
11142
data/tabelog.json
File diff suppressed because it is too large
Load Diff
53
tabelog.go
53
tabelog.go
@ -67,9 +67,7 @@ func makeAbsUrl(base, ref string) string {
|
||||
return b.ResolveReference(r).String()
|
||||
}
|
||||
|
||||
func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
|
||||
defer wg.Done()
|
||||
|
||||
func dumpReviews(filename string, in chan tabelogReview) {
|
||||
var reviews []tabelogReview
|
||||
for {
|
||||
if review, ok := <-in; ok {
|
||||
@ -89,12 +87,10 @@ func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
|
||||
}
|
||||
}
|
||||
|
||||
func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitGroup, gc *geoCache) {
|
||||
defer wg.Done()
|
||||
|
||||
func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) {
|
||||
for {
|
||||
if review, ok := <-in; ok {
|
||||
log.Printf("decoding %s", review.Name)
|
||||
log.Printf("decoding address for %s", review.Name)
|
||||
|
||||
coord, err := gc.decode(review.Address)
|
||||
if err != nil {
|
||||
@ -112,7 +108,8 @@ func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitG
|
||||
}
|
||||
}
|
||||
|
||||
func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
|
||||
func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
|
||||
log.Printf("scraping review: %s", url)
|
||||
defer wg.Done()
|
||||
|
||||
doc, err := wc.load(url)
|
||||
@ -150,52 +147,56 @@ func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *we
|
||||
out <- review
|
||||
}
|
||||
|
||||
func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
|
||||
func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
|
||||
log.Printf("scraping index: %s", url)
|
||||
defer wg.Done()
|
||||
|
||||
doc, err := wc.load(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
|
||||
if href, ok := sel.Attr("href"); ok {
|
||||
wg.Add(1)
|
||||
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
|
||||
go scrapeReview(makeAbsUrl(url, href), out, wc, wg)
|
||||
}
|
||||
})
|
||||
wg.Wait()
|
||||
|
||||
if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
|
||||
scrapeIndex(makeAbsUrl(url, href), out, wc)
|
||||
wg.Add(1)
|
||||
scrapeIndex(makeAbsUrl(url, href), out, wc, wg)
|
||||
}
|
||||
}
|
||||
|
||||
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
|
||||
func scrapeReviews(url, webCacheDir string, out chan tabelogReview) {
|
||||
wc, err := newWebCache(webCacheDir)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
scrapeIndex(url, out, wc, &wg)
|
||||
wg.Wait()
|
||||
|
||||
close(out)
|
||||
}
|
||||
|
||||
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
|
||||
gc, err := newGeoCache(geoCacheFile)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
scrapeChan := make(chan tabelogReview)
|
||||
decodeChan := make(chan tabelogReview)
|
||||
scrapeChan := make(chan tabelogReview, 2000)
|
||||
decodeChan := make(chan tabelogReview, 2000)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
go decodeReviews(scrapeChan, decodeChan, &wg, gc)
|
||||
go dumpReviews(resultFile, decodeChan, &wg)
|
||||
|
||||
scrapeIndex(url, scrapeChan, wc)
|
||||
close(scrapeChan)
|
||||
go decodeReviews(scrapeChan, decodeChan, gc)
|
||||
scrapeReviews(url, webCacheDir, scrapeChan)
|
||||
dumpReviews(resultFile, decodeChan)
|
||||
|
||||
if err := gc.save(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
11
webcache.go
11
webcache.go
@ -30,12 +30,14 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
type webCache struct {
|
||||
directory string
|
||||
ticker *time.Ticker
|
||||
}
|
||||
|
||||
func newWebCache(directory string) (*webCache, error) {
|
||||
@ -43,7 +45,12 @@ func newWebCache(directory string) (*webCache, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &webCache{directory: directory}, nil
|
||||
cache := &webCache{
|
||||
directory: directory,
|
||||
ticker: time.NewTicker(time.Millisecond * 100),
|
||||
}
|
||||
|
||||
return cache, nil
|
||||
}
|
||||
|
||||
func (c *webCache) urlToLocal(url string) string {
|
||||
@ -60,6 +67,8 @@ func (c *webCache) load(url string) (*goquery.Document, error) {
|
||||
return goquery.NewDocumentFromReader(file)
|
||||
}
|
||||
|
||||
<-c.ticker.C
|
||||
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
Loading…
Reference in New Issue
Block a user