Updating cache
parent 06bc22ebe4
commit 08bd475950

.gitignore (vendored): 1 change
@@ -1,2 +1 @@
 search-scrape
-webcache

cache/.gitignore (vendored, new file): 1 change
@@ -0,0 +1 @@
+webcache

cache/geocache.json (vendored, new file): 1 change
@@ -0,0 +1 @@
+{"1-1-3 Sagamihara Chuo-ku Sagamihara Kanagawa":{"Latitude":35.5811327,"Longitude":139.3709098},"1-6-1 Kamiookanishi Konan-ku Yokohama Kanagawa":{"Latitude":35.4072739,"Longitude":139.5961112},"16-1 Totsukacho Totsuka-ku Yokohama Kanagawa":{"Latitude":35.4015872,"Longitude":139.5332269},"2-14-1 Minamisaiwai Nishi-ku Yokohama Kanagawa":{"Latitude":35.4640304,"Longitude":139.6177293},"200 Yamashitacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4431273,"Longitude":139.6434308},"3-43-4 Wakabacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4412748,"Longitude":139.6275875},"4-14-25 Chuorinkan Yamato Kanagawa":{"Latitude":35.5080078,"Longitude":139.4455886},"4-2-1 Kamikodanaka Nakahara-ku Kawasaki Kanagawa":{"Latitude":35.5813758,"Longitude":139.6409824},"61-1 Yamashitacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4438856,"Longitude":139.6488554},"Takashima Nishi-ku Yokohama Kanagawa":{"Latitude":35.4622889,"Longitude":139.6222899}}
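
The cache file is a single JSON object mapping a street address string to its geocoded coordinates. A minimal sketch of reading it back into Go outside the scraper (the coord struct and file path mirror what geocache.go below defines; the program itself is illustrative):

package main

import (
    "encoding/json"
    "fmt"
    "log"
    "os"
)

// coord mirrors the Latitude/Longitude pairs stored in cache/geocache.json.
type coord struct {
    Latitude  float64
    Longitude float64
}

func main() {
    file, err := os.Open("cache/geocache.json")
    if err != nil {
        log.Fatal(err)
    }
    defer file.Close()

    // The whole cache is one JSON object keyed by address.
    cache := make(map[string]coord)
    if err := json.NewDecoder(file).Decode(&cache); err != nil {
        log.Fatal(err)
    }

    for address, c := range cache {
        fmt.Printf("%s -> %.7f, %.7f\n", address, c.Latitude, c.Longitude)
    }
}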

data/tabelog.json: 8480 changes
File diff suppressed because it is too large

geocache.go (new file): 97 changes
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
+ * Author: Alex Yatskov <alex@foosoft.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package main
+
+import (
+    "encoding/json"
+    "io/ioutil"
+    "os"
+    "sync"
+
+    "github.com/kellydunn/golang-geo"
+)
+
+type geoCoord struct {
+    Latitude  float64
+    Longitude float64
+}
+
+type geoCache struct {
+    cacheFile    string
+    addressCache map[string]geoCoord
+    geocoder     geo.GoogleGeocoder
+    mutex        sync.Mutex
+}
+
+func newGeoCache(cacheFile string) (*geoCache, error) {
+    cache := &geoCache{
+        cacheFile:    cacheFile,
+        addressCache: make(map[string]geoCoord)}
+
+    if err := cache.load(); err != nil {
+        return nil, err
+    }
+
+    return cache, nil
+}
+
+func (c *geoCache) load() error {
+    file, err := os.Open(c.cacheFile)
+    if os.IsNotExist(err) {
+        return nil
+    }
+    if err != nil {
+        return err
+    }
+    defer file.Close()
+
+    return json.NewDecoder(file).Decode(&c.addressCache)
+}
+
+func (c *geoCache) save() error {
+    js, err := json.MarshalIndent(c.addressCache, "", " ")
+    if err != nil {
+        return err
+    }
+
+    return ioutil.WriteFile(c.cacheFile, js, 0644)
+}
+
+func (c *geoCache) decode(address string) (geoCoord, error) {
+    if coord, ok := c.addressCache[address]; ok {
+        return coord, nil
+    }
+
+    point, err := c.geocoder.Geocode(address)
+    if err != nil {
+        return geoCoord{}, err
+    }
+
+    coord := geoCoord{point.Lat(), point.Lng()}
+
+    c.mutex.Lock()
+    c.addressCache[address] = coord
+    c.mutex.Unlock()
+
+    return coord, nil
+}
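
Taken together, geocache.go gives the scraper a read-through geocoding cache: decode returns a cached coordinate when the address has been seen before, falls back to the Google geocoder otherwise, and save writes the accumulated map back to disk. A minimal usage sketch; the helper function name is illustrative, and it assumes it sits in the same package as geocache.go with the standard log package imported:

// exampleGeoCacheUse is illustrative only and not part of this commit.
func exampleGeoCacheUse() {
    gc, err := newGeoCache("cache/geocache.json")
    if err != nil {
        log.Fatal(err)
    }

    // The first lookup of an unseen address hits the Google geocoder; repeat
    // lookups are served from the map that load() populated from disk.
    coord, err := gc.decode("200 Yamashitacho Naka-ku Yokohama Kanagawa")
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("%.7f, %.7f", coord.Latitude, coord.Longitude)

    // Persist any newly resolved addresses for the next run.
    if err := gc.save(); err != nil {
        log.Fatal(err)
    }
}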

@@ -23,5 +23,5 @@
 package main
 
 func main() {
-    scrapeTabelog("http://tabelog.com/en/kanagawa/rstLst/1/", "data/tabelog.json", "webcache")
+    scrapeTabelog("http://tabelog.com/en/kanagawa/rstLst/1/", "data/tabelog.json", "cache/webcache", "cache/geocache.json")
 }

tabelog.go: 52 changes
@@ -41,15 +41,18 @@ type tabelogParams struct {
 }
 
 type tabelogReview struct {
-    Name       string
-    Address    string
+    Name    string
+    Address string
+    Url     string
+
     Dishes     float64
     Service    float64
     Atmosphere float64
     Cost       float64
     Drinks     float64
-    Url        string
-    Raw        string
+
+    Latitude  float64
+    Longitude float64
 }
 
 func makeAbsUrl(base, ref string) string {
@@ -91,10 +94,10 @@ func dumpReviews(filename string, rc chan tabelogReview, wg *sync.WaitGroup) {
     }
 }
 
-func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
+func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache, gc *geoCache) {
     defer wg.Done()
 
-    doc, err := wc.fetchUrl(url)
+    doc, err := wc.load(url)
     if err != nil {
         log.Fatal(err)
     }
@@ -126,11 +129,19 @@ func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *web
         return
     }
 
+    coord, err := gc.decode(review.Address)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    review.Latitude = coord.Latitude
+    review.Longitude = coord.Longitude
+
     rc <- review
 }
 
-func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
-    doc, err := wc.fetchUrl(url)
+func scrapeIndex(url string, out chan tabelogReview, wc *webCache, gc *geoCache) {
+    doc, err := wc.load(url)
     if err != nil {
         log.Fatal(err)
     }
@@ -139,18 +150,23 @@ func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
     doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
         if href, ok := sel.Attr("href"); ok {
             wg.Add(1)
-            go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
+            go scrapeReview(makeAbsUrl(url, href), out, &wg, wc, gc)
         }
     })
     wg.Wait()
 
-    if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
-        scrapeIndex(makeAbsUrl(url, href), out, wc)
-    }
+    // if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
+    //     scrapeIndex(makeAbsUrl(url, href), out, wc, gc)
+    // }
 }
 
-func scrapeTabelog(url, jsonFile, cacheDir string) {
-    wc, err := newWebCache(cacheDir)
+func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
+    wc, err := newWebCache(webCacheDir)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    gc, err := newGeoCache(geoCacheFile)
     if err != nil {
         log.Fatal(err)
     }
@@ -158,9 +174,13 @@ func scrapeTabelog(url, jsonFile, cacheDir string) {
     var wg sync.WaitGroup
     wg.Add(1)
    rc := make(chan tabelogReview)
-    go dumpReviews(jsonFile, rc, &wg)
+    go dumpReviews(resultFile, rc, &wg)
 
-    scrapeIndex(url, rc, wc)
+    scrapeIndex(url, rc, wc, gc)
 
+    if err := gc.save(); err != nil {
+        log.Fatal(err)
+    }
+
     close(rc)
     wg.Wait()
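
One detail worth noting about the pieces above: scrapeIndex launches one scrapeReview goroutine per listed restaurant, and every goroutine calls gc.decode. The mutex in geocache.go guards the map insert, but the cache-hit lookup at the top of decode reads addressCache without taking the lock, so concurrent lookups and inserts can race. A sketch of a fully guarded variant, shown only as an alternative to the version in this commit:

// decode, reworked so that both the lookup and the insert happen under the mutex.
// Everything else matches geocache.go above.
func (c *geoCache) decode(address string) (geoCoord, error) {
    c.mutex.Lock()
    coord, ok := c.addressCache[address]
    c.mutex.Unlock()
    if ok {
        return coord, nil
    }

    point, err := c.geocoder.Geocode(address)
    if err != nil {
        return geoCoord{}, err
    }

    coord = geoCoord{point.Lat(), point.Lng()}

    c.mutex.Lock()
    c.addressCache[address] = coord
    c.mutex.Unlock()

    return coord, nil
}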

@@ -52,7 +52,7 @@ func (c *webCache) urlToLocal(url string) string {
     return path.Join(c.cacheDir, fmt.Sprintf("%x.html", hash.Sum(nil)))
 }
 
-func (c *webCache) fetchUrl(url string) (*goquery.Document, error) {
+func (c *webCache) load(url string) (*goquery.Document, error) {
     localPath := c.urlToLocal(url)
 
     if file, err := os.Open(localPath); err == nil {
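
For reference, urlToLocal above maps each fetched URL to a file named after a hash of that URL inside the cache directory, which is how pages end up under cache/webcache. A standalone sketch of the same idea; the md5 choice is an assumption made for illustration, since this hunk only shows hash.Sum(nil) being formatted, not which hash webcache.go constructs:

package main

import (
    "crypto/md5" // assumed for illustration; the actual hash used by webcache.go is not shown in this diff
    "fmt"
    "io"
    "path"
)

// urlToCachePath mirrors the shape of webCache.urlToLocal: hash the URL and use
// the hex digest as an .html file name inside the cache directory.
func urlToCachePath(cacheDir, url string) string {
    hash := md5.New()
    io.WriteString(hash, url)
    return path.Join(cacheDir, fmt.Sprintf("%x.html", hash.Sum(nil)))
}

func main() {
    fmt.Println(urlToCachePath("cache/webcache", "http://tabelog.com/en/kanagawa/rstLst/1/"))
}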