Updating cache
This commit is contained in:
parent
06bc22ebe4
commit
08bd475950
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,2 +1 @@
|
||||
search-scrape
|
||||
webcache
|
||||
|
1
cache/.gitignore
vendored
Normal file
1
cache/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
webcache
|
1
cache/geocache.json
vendored
Normal file
1
cache/geocache.json
vendored
Normal file
@ -0,0 +1 @@
|
||||
{"1-1-3 Sagamihara Chuo-ku Sagamihara Kanagawa":{"Latitude":35.5811327,"Longitude":139.3709098},"1-6-1 Kamiookanishi Konan-ku Yokohama Kanagawa":{"Latitude":35.4072739,"Longitude":139.5961112},"16-1 Totsukacho Totsuka-ku Yokohama Kanagawa":{"Latitude":35.4015872,"Longitude":139.5332269},"2-14-1 Minamisaiwai Nishi-ku Yokohama Kanagawa":{"Latitude":35.4640304,"Longitude":139.6177293},"200 Yamashitacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4431273,"Longitude":139.6434308},"3-43-4 Wakabacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4412748,"Longitude":139.6275875},"4-14-25 Chuorinkan Yamato Kanagawa":{"Latitude":35.5080078,"Longitude":139.4455886},"4-2-1 Kamikodanaka Nakahara-ku Kawasaki Kanagawa":{"Latitude":35.5813758,"Longitude":139.6409824},"61-1 Yamashitacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4438856,"Longitude":139.6488554},"Takashima Nishi-ku Yokohama Kanagawa":{"Latitude":35.4622889,"Longitude":139.6222899}}
|
8480
data/tabelog.json
8480
data/tabelog.json
File diff suppressed because it is too large
Load Diff
97
geocache.go
Normal file
97
geocache.go
Normal file
@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
|
||||
* Author: Alex Yatskov <alex@foosoft.net>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
* this software and associated documentation files (the "Software"), to deal in
|
||||
* the Software without restriction, including without limitation the rights to
|
||||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
* the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"github.com/kellydunn/golang-geo"
|
||||
)
|
||||
|
||||
// geoCoord is a latitude/longitude pair. Both fields are exported so that
// encoding/json includes them when the address cache is written to disk.
type geoCoord struct {
	Latitude, Longitude float64
}
|
||||
|
||||
type geoCache struct {
|
||||
cacheFile string
|
||||
addressCache map[string]geoCoord
|
||||
geocoder geo.GoogleGeocoder
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
func newGeoCache(cacheFile string) (*geoCache, error) {
|
||||
cache := &geoCache{
|
||||
cacheFile: cacheFile,
|
||||
addressCache: make(map[string]geoCoord)}
|
||||
|
||||
if err := cache.load(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return cache, nil
|
||||
}
|
||||
|
||||
func (c *geoCache) load() error {
|
||||
file, err := os.Open(c.cacheFile)
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
return json.NewDecoder(file).Decode(&c.addressCache)
|
||||
}
|
||||
|
||||
func (c *geoCache) save() error {
|
||||
js, err := json.MarshalIndent(c.addressCache, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return ioutil.WriteFile(c.cacheFile, js, 0644)
|
||||
}
|
||||
|
||||
func (c *geoCache) decode(address string) (geoCoord, error) {
|
||||
if coord, ok := c.addressCache[address]; ok {
|
||||
return coord, nil
|
||||
}
|
||||
|
||||
point, err := c.geocoder.Geocode(address)
|
||||
if err != nil {
|
||||
return geoCoord{}, err
|
||||
}
|
||||
|
||||
coord := geoCoord{point.Lat(), point.Lng()}
|
||||
|
||||
c.mutex.Lock()
|
||||
c.addressCache[address] = coord
|
||||
c.mutex.Unlock()
|
||||
|
||||
return coord, nil
|
||||
}
|
@ -23,5 +23,5 @@
|
||||
package main
|
||||
|
||||
func main() {
|
||||
scrapeTabelog("http://tabelog.com/en/kanagawa/rstLst/1/", "data/tabelog.json", "webcache")
|
||||
scrapeTabelog("http://tabelog.com/en/kanagawa/rstLst/1/", "data/tabelog.json", "cache/webcache", "cache/geocache.json")
|
||||
}
|
||||
|
48
tabelog.go
48
tabelog.go
@ -43,13 +43,16 @@ type tabelogParams struct {
|
||||
type tabelogReview struct {
|
||||
Name string
|
||||
Address string
|
||||
Url string
|
||||
|
||||
Dishes float64
|
||||
Service float64
|
||||
Atmosphere float64
|
||||
Cost float64
|
||||
Drinks float64
|
||||
Url string
|
||||
Raw string
|
||||
|
||||
Latitude float64
|
||||
Longitude float64
|
||||
}
|
||||
|
||||
func makeAbsUrl(base, ref string) string {
|
||||
@ -91,10 +94,10 @@ func dumpReviews(filename string, rc chan tabelogReview, wg *sync.WaitGroup) {
|
||||
}
|
||||
}
|
||||
|
||||
func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
|
||||
func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache, gc *geoCache) {
|
||||
defer wg.Done()
|
||||
|
||||
doc, err := wc.fetchUrl(url)
|
||||
doc, err := wc.load(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@ -126,11 +129,19 @@ func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *web
|
||||
return
|
||||
}
|
||||
|
||||
coord, err := gc.decode(review.Address)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
review.Latitude = coord.Latitude
|
||||
review.Longitude = coord.Longitude
|
||||
|
||||
rc <- review
|
||||
}
|
||||
|
||||
func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
|
||||
doc, err := wc.fetchUrl(url)
|
||||
func scrapeIndex(url string, out chan tabelogReview, wc *webCache, gc *geoCache) {
|
||||
doc, err := wc.load(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@ -139,18 +150,23 @@ func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
|
||||
doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
|
||||
if href, ok := sel.Attr("href"); ok {
|
||||
wg.Add(1)
|
||||
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
|
||||
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc, gc)
|
||||
}
|
||||
})
|
||||
wg.Wait()
|
||||
|
||||
if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
|
||||
scrapeIndex(makeAbsUrl(url, href), out, wc)
|
||||
}
|
||||
// if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
|
||||
// scrapeIndex(makeAbsUrl(url, href), out, wc, gc)
|
||||
// }
|
||||
}
|
||||
|
||||
func scrapeTabelog(url, jsonFile, cacheDir string) {
|
||||
wc, err := newWebCache(cacheDir)
|
||||
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
|
||||
wc, err := newWebCache(webCacheDir)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
gc, err := newGeoCache(geoCacheFile)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@ -158,9 +174,13 @@ func scrapeTabelog(url, jsonFile, cacheDir string) {
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
rc := make(chan tabelogReview)
|
||||
go dumpReviews(jsonFile, rc, &wg)
|
||||
go dumpReviews(resultFile, rc, &wg)
|
||||
|
||||
scrapeIndex(url, rc, wc)
|
||||
scrapeIndex(url, rc, wc, gc)
|
||||
|
||||
if err := gc.save(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
close(rc)
|
||||
wg.Wait()
|
||||
|
@ -52,7 +52,7 @@ func (c *webCache) urlToLocal(url string) string {
|
||||
return path.Join(c.cacheDir, fmt.Sprintf("%x.html", hash.Sum(nil)))
|
||||
}
|
||||
|
||||
func (c *webCache) fetchUrl(url string) (*goquery.Document, error) {
|
||||
func (c *webCache) load(url string) (*goquery.Document, error) {
|
||||
localPath := c.urlToLocal(url)
|
||||
|
||||
if file, err := os.Open(localPath); err == nil {
|
||||
|
Loading…
Reference in New Issue
Block a user