1

Throttling improvements

This commit is contained in:
Alex Yatskov 2015-08-14 17:27:09 +09:00
parent adf3e20004
commit 439196a87d
4 changed files with 5640 additions and 5598 deletions

32
cache/geocache.json vendored
View File

@ -271,6 +271,10 @@
"Latitude": 35.4088483,
"Longitude": 139.5953328
},
"1-15-8 Mizonokuchi Takatsu-ku Kawasaki Kanagawa": {
"Latitude": 35.6006815,
"Longitude": 139.6137396
},
"1-16-10 Fujigaoka Aoba-ku Yokohama Kanagawa": {
"Latitude": 35.5453749,
"Longitude": 139.5274005
@ -455,6 +459,10 @@
"Latitude": 35.5081315,
"Longitude": 139.6788956
},
"1-3-1 Minamisaiwai Nishi-ku Yokohama Kanagawa": {
"Latitude": 35.4673383,
"Longitude": 139.6220148
},
"1-3-12 Kugenumaishigami Fujisawa Kanagawa": {
"Latitude": 35.3366651,
"Longitude": 139.488675
@ -863,6 +871,10 @@
"Latitude": 35.3690655,
"Longitude": 139.5224912
},
"137 Yamashitacho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.4425663,
"Longitude": 139.6461168
},
"138 Yamashitacho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.4425774,
"Longitude": 139.6458973
@ -991,6 +1003,10 @@
"Latitude": 35.4433217,
"Longitude": 139.6463001
},
"191-4 Yamashitacho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.4433217,
"Longitude": 139.6463001
},
"192-15 Yamashitacho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.4433273,
"Longitude": 139.6467417
@ -1911,6 +1927,10 @@
"Latitude": 35.37976829999999,
"Longitude": 139.5787929
},
"3-27-3 Tokiwacho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.445752,
"Longitude": 139.6370117
},
"3-28-13 Shimonagaya Konan-ku Yokohama Kanagawa": {
"Latitude": 35.4078842,
"Longitude": 139.5642913
@ -2019,6 +2039,10 @@
"Latitude": 35.4729078,
"Longitude": 139.4637948
},
"3-75 Suehirocho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.4437467,
"Longitude": 139.6323621
},
"3-77 Miyagawacho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.446688,
"Longitude": 139.6276539
@ -2103,6 +2127,10 @@
"Latitude": 35.6108298,
"Longitude": 139.5921748
},
"4-122-3 Isezakicho Naka-ku Yokohama Kanagawa": {
"Latitude": 35.4414997,
"Longitude": 139.628118
},
"4-14-25 Chuorinkan Yamato Kanagawa": {
"Latitude": 35.5080078,
"Longitude": 139.4455886
@ -2671,6 +2699,10 @@
"Latitude": 35.4651997,
"Longitude": 139.6191458
},
"神奈川県横浜市西区南幸2-15-1": {
"Latitude": 35.4647525,
"Longitude": 139.6187736
},
"神奈川県横浜市都筑区茅ヶ崎中央30-17": {
"Latitude": 35.54467,
"Longitude": 139.5694442

File diff suppressed because it is too large Load Diff

View File

@ -67,9 +67,7 @@ func makeAbsUrl(base, ref string) string {
return b.ResolveReference(r).String()
}
func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
defer wg.Done()
func dumpReviews(filename string, in chan tabelogReview) {
var reviews []tabelogReview
for {
if review, ok := <-in; ok {
@ -89,12 +87,10 @@ func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
}
}
func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitGroup, gc *geoCache) {
defer wg.Done()
func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) {
for {
if review, ok := <-in; ok {
log.Printf("decoding %s", review.Name)
log.Printf("decoding address for %s", review.Name)
coord, err := gc.decode(review.Address)
if err != nil {
@ -112,7 +108,8 @@ func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitG
}
}
func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
log.Printf("scraping review: %s", url)
defer wg.Done()
doc, err := wc.load(url)
@ -150,52 +147,56 @@ func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *we
out <- review
}
func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
log.Printf("scraping index: %s", url)
defer wg.Done()
doc, err := wc.load(url)
if err != nil {
log.Fatal(err)
}
var wg sync.WaitGroup
doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
if href, ok := sel.Attr("href"); ok {
wg.Add(1)
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
go scrapeReview(makeAbsUrl(url, href), out, wc, wg)
}
})
wg.Wait()
if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
scrapeIndex(makeAbsUrl(url, href), out, wc)
wg.Add(1)
scrapeIndex(makeAbsUrl(url, href), out, wc, wg)
}
}
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
func scrapeReviews(url, webCacheDir string, out chan tabelogReview) {
wc, err := newWebCache(webCacheDir)
if err != nil {
log.Fatal(err)
}
var wg sync.WaitGroup
wg.Add(1)
scrapeIndex(url, out, wc, &wg)
wg.Wait()
close(out)
}
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
gc, err := newGeoCache(geoCacheFile)
if err != nil {
log.Fatal(err)
}
scrapeChan := make(chan tabelogReview)
decodeChan := make(chan tabelogReview)
scrapeChan := make(chan tabelogReview, 2000)
decodeChan := make(chan tabelogReview, 2000)
var wg sync.WaitGroup
wg.Add(2)
go decodeReviews(scrapeChan, decodeChan, &wg, gc)
go dumpReviews(resultFile, decodeChan, &wg)
scrapeIndex(url, scrapeChan, wc)
close(scrapeChan)
go decodeReviews(scrapeChan, decodeChan, gc)
scrapeReviews(url, webCacheDir, scrapeChan)
dumpReviews(resultFile, decodeChan)
if err := gc.save(); err != nil {
log.Fatal(err)
}
wg.Wait()
}

View File

@ -30,12 +30,14 @@ import (
"net/http"
"os"
"path"
"time"
"github.com/PuerkitoBio/goquery"
)
// webCache is the receiver type for the urlToLocal and load methods and
// carries the state they share.
type webCache struct {
// Set from the directory argument passed to newWebCache.
// NOTE(review): presumably the folder holding locally cached pages;
// confirm against urlToLocal, whose body is not visible here.
directory string
// load receives from ticker.C immediately before calling http.Get, so
// every network fetch first waits for a tick. newWebCache creates the
// ticker with a 100 millisecond period.
ticker *time.Ticker
}
func newWebCache(directory string) (*webCache, error) {
@ -43,7 +45,12 @@ func newWebCache(directory string) (*webCache, error) {
return nil, err
}
return &webCache{directory: directory}, nil
cache := &webCache{
directory: directory,
ticker: time.NewTicker(time.Millisecond * 100),
}
return cache, nil
}
func (c *webCache) urlToLocal(url string) string {
@ -60,6 +67,8 @@ func (c *webCache) load(url string) (*goquery.Document, error) {
return goquery.NewDocumentFromReader(file)
}
<-c.ticker.C
res, err := http.Get(url)
if err != nil {
return nil, err