Throttling improvements
parent adf3e20004
commit 439196a87d
cache/geocache.json (vendored, 32 lines changed)
@@ -271,6 +271,10 @@
         "Latitude": 35.4088483,
         "Longitude": 139.5953328
     },
+    "1-15-8 Mizonokuchi Takatsu-ku Kawasaki Kanagawa": {
+        "Latitude": 35.6006815,
+        "Longitude": 139.6137396
+    },
     "1-16-10 Fujigaoka Aoba-ku Yokohama Kanagawa": {
         "Latitude": 35.5453749,
         "Longitude": 139.5274005
@@ -455,6 +459,10 @@
         "Latitude": 35.5081315,
         "Longitude": 139.6788956
     },
+    "1-3-1 Minamisaiwai Nishi-ku Yokohama Kanagawa": {
+        "Latitude": 35.4673383,
+        "Longitude": 139.6220148
+    },
     "1-3-12 Kugenumaishigami Fujisawa Kanagawa": {
         "Latitude": 35.3366651,
         "Longitude": 139.488675
@@ -863,6 +871,10 @@
         "Latitude": 35.3690655,
         "Longitude": 139.5224912
     },
+    "137 Yamashitacho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4425663,
+        "Longitude": 139.6461168
+    },
     "138 Yamashitacho Naka-ku Yokohama Kanagawa": {
         "Latitude": 35.4425774,
         "Longitude": 139.6458973
@@ -991,6 +1003,10 @@
         "Latitude": 35.4433217,
         "Longitude": 139.6463001
     },
+    "191-4 Yamashitacho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4433217,
+        "Longitude": 139.6463001
+    },
     "192-15 Yamashitacho Naka-ku Yokohama Kanagawa": {
         "Latitude": 35.4433273,
         "Longitude": 139.6467417
@@ -1911,6 +1927,10 @@
         "Latitude": 35.37976829999999,
         "Longitude": 139.5787929
     },
+    "3-27-3 Tokiwacho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.445752,
+        "Longitude": 139.6370117
+    },
     "3-28-13 Shimonagaya Konan-ku Yokohama Kanagawa": {
         "Latitude": 35.4078842,
         "Longitude": 139.5642913
@@ -2019,6 +2039,10 @@
         "Latitude": 35.4729078,
         "Longitude": 139.4637948
     },
+    "3-75 Suehirocho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4437467,
+        "Longitude": 139.6323621
+    },
     "3-77 Miyagawacho Naka-ku Yokohama Kanagawa": {
         "Latitude": 35.446688,
         "Longitude": 139.6276539
@@ -2103,6 +2127,10 @@
         "Latitude": 35.6108298,
         "Longitude": 139.5921748
     },
+    "4-122-3 Isezakicho Naka-ku Yokohama Kanagawa": {
+        "Latitude": 35.4414997,
+        "Longitude": 139.628118
+    },
     "4-14-25 Chuorinkan Yamato Kanagawa": {
         "Latitude": 35.5080078,
         "Longitude": 139.4455886
@@ -2671,6 +2699,10 @@
         "Latitude": 35.4651997,
         "Longitude": 139.6191458
     },
+    "神奈川県横浜市西区南幸2-15-1": {
+        "Latitude": 35.4647525,
+        "Longitude": 139.6187736
+    },
     "神奈川県横浜市都筑区茅ヶ崎中央30-17": {
         "Latitude": 35.54467,
         "Longitude": 139.5694442
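Each entry in cache/geocache.json maps a raw restaurant address to a latitude/longitude pair, which is why every hunk above adds the same four-line shape. For illustration only, a file with this layout could be read into a Go map as sketched below; the coordinate type and loadGeocache helper are hypothetical names and not part of this repository's geocache code.

package main

import (
    "encoding/json"
    "fmt"
    "os"
)

// coordinate mirrors the shape of one value in cache/geocache.json.
type coordinate struct {
    Latitude  float64
    Longitude float64
}

// loadGeocache is a hypothetical helper: it reads the whole cache file
// into an address -> coordinate map.
func loadGeocache(path string) (map[string]coordinate, error) {
    data, err := os.ReadFile(path)
    if err != nil {
        return nil, err
    }

    cache := make(map[string]coordinate)
    if err := json.Unmarshal(data, &cache); err != nil {
        return nil, err
    }

    return cache, nil
}

func main() {
    cache, err := loadGeocache("cache/geocache.json")
    if err != nil {
        fmt.Println("error:", err)
        return
    }

    // Look up one of the addresses added in this commit.
    fmt.Println(cache["1-15-8 Mizonokuchi Takatsu-ku Kawasaki Kanagawa"])
}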
data/tabelog.json (11142 lines changed)
File diff suppressed because it is too large
tabelog.go (53 lines changed)
@@ -67,9 +67,7 @@ func makeAbsUrl(base, ref string) string {
     return b.ResolveReference(r).String()
 }
 
-func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
-    defer wg.Done()
-
+func dumpReviews(filename string, in chan tabelogReview) {
     var reviews []tabelogReview
     for {
         if review, ok := <-in; ok {
@@ -89,12 +87,10 @@ func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
         }
     }
 }
 
-func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitGroup, gc *geoCache) {
-    defer wg.Done()
-
+func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) {
     for {
         if review, ok := <-in; ok {
-            log.Printf("decoding %s", review.Name)
+            log.Printf("decoding address for %s", review.Name)
 
             coord, err := gc.decode(review.Address)
             if err != nil {
@@ -112,7 +108,8 @@ func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitG
         }
     }
 }
 
-func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
+func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
+    log.Printf("scraping review: %s", url)
     defer wg.Done()
 
     doc, err := wc.load(url)
@@ -150,52 +147,56 @@ func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *we
     out <- review
 }
 
-func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
+func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
+    log.Printf("scraping index: %s", url)
+    defer wg.Done()
+
     doc, err := wc.load(url)
     if err != nil {
         log.Fatal(err)
     }
 
-    var wg sync.WaitGroup
     doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
         if href, ok := sel.Attr("href"); ok {
             wg.Add(1)
-            go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
+            go scrapeReview(makeAbsUrl(url, href), out, wc, wg)
         }
     })
-    wg.Wait()
 
     if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
-        scrapeIndex(makeAbsUrl(url, href), out, wc)
+        wg.Add(1)
+        scrapeIndex(makeAbsUrl(url, href), out, wc, wg)
     }
 }
 
-func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
+func scrapeReviews(url, webCacheDir string, out chan tabelogReview) {
     wc, err := newWebCache(webCacheDir)
     if err != nil {
         log.Fatal(err)
     }
 
+    var wg sync.WaitGroup
+    wg.Add(1)
+    scrapeIndex(url, out, wc, &wg)
+    wg.Wait()
+
+    close(out)
+}
+
+func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
     gc, err := newGeoCache(geoCacheFile)
     if err != nil {
         log.Fatal(err)
     }
 
-    scrapeChan := make(chan tabelogReview)
-    decodeChan := make(chan tabelogReview)
+    scrapeChan := make(chan tabelogReview, 2000)
+    decodeChan := make(chan tabelogReview, 2000)
 
-    var wg sync.WaitGroup
-    wg.Add(2)
-
-    go decodeReviews(scrapeChan, decodeChan, &wg, gc)
-    go dumpReviews(resultFile, decodeChan, &wg)
-
-    scrapeIndex(url, scrapeChan, wc)
-    close(scrapeChan)
+    go decodeReviews(scrapeChan, decodeChan, gc)
+    scrapeReviews(url, webCacheDir, scrapeChan)
+    dumpReviews(resultFile, decodeChan)
 
     if err := gc.save(); err != nil {
         log.Fatal(err)
     }
-
-    wg.Wait()
 }
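The tabelog.go changes restructure the scraper into a channel pipeline: scrapeReviews fans page fetches out to goroutines tracked by a sync.WaitGroup and closes its output channel only after wg.Wait() returns, while decodeReviews and dumpReviews simply drain their input channels; the 2000-slot buffers keep scraping from stalling behind geocoding and serialization. Below is a minimal, self-contained sketch of that close-after-Wait pattern; the produce/consume names are hypothetical and this is an illustration, not the project's code.

package main

import (
    "fmt"
    "sync"
)

// produce simulates one scraper goroutine pushing a result into out.
func produce(id int, out chan<- string, wg *sync.WaitGroup) {
    defer wg.Done()
    out <- fmt.Sprintf("result from producer %d", id)
}

func main() {
    // A buffered channel keeps producers from blocking on a slow consumer,
    // the same role as the 2000-slot buffers given to scrapeChan and decodeChan.
    results := make(chan string, 16)

    var wg sync.WaitGroup
    for i := 0; i < 4; i++ {
        wg.Add(1)
        go produce(i, results, &wg)
    }

    // Close the channel only after every producer is done, which is what
    // scrapeReviews does with wg.Wait() followed by close(out).
    go func() {
        wg.Wait()
        close(results)
    }()

    // Drain until the channel is closed, much as dumpReviews drains decodeChan.
    for r := range results {
        fmt.Println(r)
    }
}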
webcache.go (11 lines changed)
@@ -30,12 +30,14 @@ import (
     "net/http"
     "os"
     "path"
+    "time"
 
     "github.com/PuerkitoBio/goquery"
 )
 
 type webCache struct {
     directory string
+    ticker    *time.Ticker
 }
 
 func newWebCache(directory string) (*webCache, error) {
@@ -43,7 +45,12 @@ func newWebCache(directory string) (*webCache, error) {
         return nil, err
     }
 
-    return &webCache{directory: directory}, nil
+    cache := &webCache{
+        directory: directory,
+        ticker:    time.NewTicker(time.Millisecond * 100),
+    }
+
+    return cache, nil
 }
 
 func (c *webCache) urlToLocal(url string) string {
@@ -60,6 +67,8 @@ func (c *webCache) load(url string) (*goquery.Document, error) {
         return goquery.NewDocumentFromReader(file)
     }
 
+    <-c.ticker.C
+
     res, err := http.Get(url)
     if err != nil {
         return nil, err
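The throttling itself lives in webCache: newWebCache now creates a time.Ticker that fires every 100 milliseconds, and load blocks on <-c.ticker.C before each uncached http.Get, so outgoing requests stay spaced out no matter how many scraper goroutines are running. A stripped-down sketch of the same idea follows, using a hypothetical throttledClient wrapper rather than the project's webCache.

package main

import (
    "fmt"
    "net/http"
    "time"
)

// throttledClient spaces outgoing requests by waiting on a ticker,
// the same idea as the ticker field added to webCache in this commit.
type throttledClient struct {
    ticker *time.Ticker
}

func newThrottledClient(interval time.Duration) *throttledClient {
    return &throttledClient{ticker: time.NewTicker(interval)}
}

// get blocks until the next tick before issuing the request, so calls made
// from many goroutines are still spread out over time.
func (c *throttledClient) get(url string) (*http.Response, error) {
    <-c.ticker.C
    return http.Get(url)
}

func main() {
    // 100ms matches the interval used in newWebCache above.
    c := newThrottledClient(100 * time.Millisecond)
    for i := 0; i < 3; i++ {
        res, err := c.get("https://example.com/")
        if err != nil {
            fmt.Println("error:", err)
            continue
        }
        res.Body.Close()
        fmt.Println("fetched with status", res.Status)
    }
}

Worth noting: a Ticker banks at most one pending tick, so after an idle stretch two requests can leave almost back to back; if stricter pacing ever matters, a token-bucket limiter such as golang.org/x/time/rate is the usual alternative.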