1

Updating cache

This commit is contained in:
Alex Yatskov 2015-08-13 19:18:17 +09:00
parent 06bc22ebe4
commit 08bd475950
8 changed files with 191 additions and 8445 deletions

1
.gitignore vendored
View File

@ -1,2 +1 @@
search-scrape
webcache

1
cache/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
webcache

1
cache/geocache.json vendored Normal file
View File

@ -0,0 +1 @@
{"1-1-3 Sagamihara Chuo-ku Sagamihara Kanagawa":{"Latitude":35.5811327,"Longitude":139.3709098},"1-6-1 Kamiookanishi Konan-ku Yokohama Kanagawa":{"Latitude":35.4072739,"Longitude":139.5961112},"16-1 Totsukacho Totsuka-ku Yokohama Kanagawa":{"Latitude":35.4015872,"Longitude":139.5332269},"2-14-1 Minamisaiwai Nishi-ku Yokohama Kanagawa":{"Latitude":35.4640304,"Longitude":139.6177293},"200 Yamashitacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4431273,"Longitude":139.6434308},"3-43-4 Wakabacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4412748,"Longitude":139.6275875},"4-14-25 Chuorinkan Yamato Kanagawa":{"Latitude":35.5080078,"Longitude":139.4455886},"4-2-1 Kamikodanaka Nakahara-ku Kawasaki Kanagawa":{"Latitude":35.5813758,"Longitude":139.6409824},"61-1 Yamashitacho Naka-ku Yokohama Kanagawa":{"Latitude":35.4438856,"Longitude":139.6488554},"Takashima Nishi-ku Yokohama Kanagawa":{"Latitude":35.4622889,"Longitude":139.6222899}}

File diff suppressed because it is too large Load Diff

97
geocache.go Normal file
View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"encoding/json"
"io/ioutil"
"os"
"sync"
"github.com/kellydunn/golang-geo"
)
// geoCoord is a geographic position. Both fields are exported so that
// encoding/json can read and write them when the cache is persisted.
type geoCoord struct {
	Latitude, Longitude float64
}
type geoCache struct {
cacheFile string
addressCache map[string]geoCoord
geocoder geo.GoogleGeocoder
mutex sync.Mutex
}
// newGeoCache creates a geoCache bound to cacheFile and fills it with
// whatever load can read from that file. If load reports an error, no cache
// is returned and the error is passed back to the caller.
func newGeoCache(cacheFile string) (*geoCache, error) {
	gc := new(geoCache)
	gc.cacheFile = cacheFile
	gc.addressCache = map[string]geoCoord{}

	err := gc.load()
	if err != nil {
		return nil, err
	}

	return gc, nil
}
// load reads the JSON stored in c.cacheFile into c.addressCache.
//
// A cache file that does not exist is not an error: the cache is left as it
// is and nil is returned. Any other failure to open the file, and any JSON
// decoding error, is returned to the caller.
func (c *geoCache) load() error {
	fp, err := os.Open(c.cacheFile)
	switch {
	case err == nil:
		// Opened successfully; continue with decoding below.
	case os.IsNotExist(err):
		return nil
	default:
		return err
	}
	defer fp.Close()

	dec := json.NewDecoder(fp)
	return dec.Decode(&c.addressCache)
}
// save serializes c.addressCache as indented JSON and writes it to
// c.cacheFile with mode 0644. It returns the marshalling error if there is
// one, and otherwise the result of the write.
func (c *geoCache) save() error {
	data, err := json.MarshalIndent(c.addressCache, "", " ")
	if err == nil {
		err = ioutil.WriteFile(c.cacheFile, data, 0644)
	}
	return err
}
// decode returns the coordinates for address.
//
// The in-memory cache is consulted first; on a miss the address is resolved
// through c.geocoder and the result is stored in the cache before being
// returned. If the geocoder fails, a zero geoCoord and the geocoder's error
// are returned and nothing is cached.
//
// Every access to c.addressCache, reads included, is made while holding
// c.mutex: a map read that overlaps a map write in another goroutine is a
// data race. The lock is released before the geocoder is called so that a
// slow lookup does not block other callers. Two goroutines that miss on the
// same address may therefore both query the geocoder; each then stores its
// own result under that key.
func (c *geoCache) decode(address string) (geoCoord, error) {
	c.mutex.Lock()
	coord, ok := c.addressCache[address]
	c.mutex.Unlock()
	if ok {
		return coord, nil
	}

	point, err := c.geocoder.Geocode(address)
	if err != nil {
		return geoCoord{}, err
	}

	coord = geoCoord{Latitude: point.Lat(), Longitude: point.Lng()}

	c.mutex.Lock()
	c.addressCache[address] = coord
	c.mutex.Unlock()

	return coord, nil
}

View File

@ -23,5 +23,5 @@
package main
func main() {
scrapeTabelog("http://tabelog.com/en/kanagawa/rstLst/1/", "data/tabelog.json", "webcache")
scrapeTabelog("http://tabelog.com/en/kanagawa/rstLst/1/", "data/tabelog.json", "cache/webcache", "cache/geocache.json")
}

View File

@ -41,15 +41,18 @@ type tabelogParams struct {
}
type tabelogReview struct {
Name string
Address string
Name string
Address string
Url string
Dishes float64
Service float64
Atmosphere float64
Cost float64
Drinks float64
Url string
Raw string
Latitude float64
Longitude float64
}
func makeAbsUrl(base, ref string) string {
@ -91,10 +94,10 @@ func dumpReviews(filename string, rc chan tabelogReview, wg *sync.WaitGroup) {
}
}
func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache, gc *geoCache) {
defer wg.Done()
doc, err := wc.fetchUrl(url)
doc, err := wc.load(url)
if err != nil {
log.Fatal(err)
}
@ -126,11 +129,19 @@ func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *web
return
}
coord, err := gc.decode(review.Address)
if err != nil {
log.Fatal(err)
}
review.Latitude = coord.Latitude
review.Longitude = coord.Longitude
rc <- review
}
func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
doc, err := wc.fetchUrl(url)
func scrapeIndex(url string, out chan tabelogReview, wc *webCache, gc *geoCache) {
doc, err := wc.load(url)
if err != nil {
log.Fatal(err)
}
@ -139,18 +150,23 @@ func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
if href, ok := sel.Attr("href"); ok {
wg.Add(1)
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc, gc)
}
})
wg.Wait()
if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
scrapeIndex(makeAbsUrl(url, href), out, wc)
}
// if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
// scrapeIndex(makeAbsUrl(url, href), out, wc, gc)
// }
}
func scrapeTabelog(url, jsonFile, cacheDir string) {
wc, err := newWebCache(cacheDir)
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
wc, err := newWebCache(webCacheDir)
if err != nil {
log.Fatal(err)
}
gc, err := newGeoCache(geoCacheFile)
if err != nil {
log.Fatal(err)
}
@ -158,9 +174,13 @@ func scrapeTabelog(url, jsonFile, cacheDir string) {
var wg sync.WaitGroup
wg.Add(1)
rc := make(chan tabelogReview)
go dumpReviews(jsonFile, rc, &wg)
go dumpReviews(resultFile, rc, &wg)
scrapeIndex(url, rc, wc)
scrapeIndex(url, rc, wc, gc)
if err := gc.save(); err != nil {
log.Fatal(err)
}
close(rc)
wg.Wait()

View File

@ -52,7 +52,7 @@ func (c *webCache) urlToLocal(url string) string {
return path.Join(c.cacheDir, fmt.Sprintf("%x.html", hash.Sum(nil)))
}
func (c *webCache) fetchUrl(url string) (*goquery.Document, error) {
func (c *webCache) load(url string) (*goquery.Document, error) {
localPath := c.urlToLocal(url)
if file, err := os.Open(localPath); err == nil {