1
This commit is contained in:
Alex Yatskov 2015-08-15 17:37:34 +09:00
parent 347ae73fe5
commit 6ae001c044
4 changed files with 5833 additions and 5835 deletions

File diff suppressed because it is too large Load Diff

View File

@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
package geo package main
import ( import (
"encoding/json" "encoding/json"
@ -31,22 +31,22 @@ import (
"github.com/kellydunn/golang-geo" "github.com/kellydunn/golang-geo"
) )
// geoPos is a latitude/longitude pair produced by geocoding an address.
type geoPos struct {
	Latitude  float64
	Longitude float64
}
type Cache struct { type geoCache struct {
filename string filename string
data map[string]Coord data map[string]geoPos
ticker *time.Ticker ticker *time.Ticker
coder geo.GoogleGeocoder coder geo.GoogleGeocoder
} }
func NewCache(filename string) (*Cache, error) { func newGeoCache(filename string) (*geoCache, error) {
cache := &Cache{ cache := &geoCache{
filename: filename, filename: filename,
data: make(map[string]Coord), data: make(map[string]geoPos),
ticker: time.NewTicker(time.Millisecond * 200), ticker: time.NewTicker(time.Millisecond * 200),
} }
@ -57,7 +57,7 @@ func NewCache(filename string) (*Cache, error) {
return cache, nil return cache, nil
} }
func (c *Cache) load() error { func (c *geoCache) load() error {
file, err := os.Open(c.filename) file, err := os.Open(c.filename)
if os.IsNotExist(err) { if os.IsNotExist(err) {
return nil return nil
@ -70,7 +70,7 @@ func (c *Cache) load() error {
return json.NewDecoder(file).Decode(&c.data) return json.NewDecoder(file).Decode(&c.data)
} }
func (c *Cache) Save() error { func (c *geoCache) save() error {
js, err := json.MarshalIndent(c.data, "", " ") js, err := json.MarshalIndent(c.data, "", " ")
if err != nil { if err != nil {
return err return err
@ -79,19 +79,19 @@ func (c *Cache) Save() error {
return ioutil.WriteFile(c.filename, js, 0644) return ioutil.WriteFile(c.filename, js, 0644)
} }
func (c *Cache) Decode(address string) (Coord, error) { func (c *geoCache) decode(address string) (geoPos, error) {
if coord, ok := c.data[address]; ok { if pos, ok := c.data[address]; ok {
return coord, nil return pos, nil
} }
<-c.ticker.C <-c.ticker.C
point, err := c.coder.Geocode(address) point, err := c.coder.Geocode(address)
if err != nil { if err != nil {
return Coord{}, err return geoPos{}, err
} }
coord := Coord{point.Lat(), point.Lng()} pos := geoPos{point.Lat(), point.Lng()}
c.data[address] = coord c.data[address] = pos
return coord, nil return pos, nil
} }

View File

@ -31,8 +31,6 @@ import (
"strings" "strings"
"sync" "sync"
"github.com/FooSoft/scrape/geo"
"github.com/FooSoft/scrape/web"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
@ -91,13 +89,13 @@ func dumpReviews(filename string, in chan tabelogReview) error {
return nil return nil
} }
func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geo.Cache) { func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) {
for { for {
if review, ok := <-in; ok { if review, ok := <-in; ok {
coord, err := gc.Decode(review.Address) pos, err := gc.decode(review.Address)
if err == nil { if err == nil {
review.Latitude = coord.Latitude review.Latitude = pos.Latitude
review.Longitude = coord.Longitude review.Longitude = pos.Longitude
out <- review out <- review
} else { } else {
log.Printf("failed to decode address for %s (%v)", review.Url, err) log.Printf("failed to decode address for %s (%v)", review.Url, err)
@ -109,10 +107,10 @@ func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geo.Cache)
} }
} }
func scrapeReview(url string, out chan tabelogReview, wc *web.Cache, wg *sync.WaitGroup) { func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
defer wg.Done() defer wg.Done()
doc, err := wc.Load(url) doc, err := wc.load(url)
if err != nil { if err != nil {
log.Printf("failed to scrape review at %s (%v)", url, err) log.Printf("failed to scrape review at %s (%v)", url, err)
return return
@ -148,8 +146,8 @@ func scrapeReview(url string, out chan tabelogReview, wc *web.Cache, wg *sync.Wa
out <- review out <- review
} }
func scrapeIndex(url string, out chan tabelogReview, wc *web.Cache, wg *sync.WaitGroup) { func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
doc, err := wc.Load(url) doc, err := wc.load(url)
if err != nil { if err != nil {
log.Printf("failed to scrape index at %s (%v)", url, err) log.Printf("failed to scrape index at %s (%v)", url, err)
return return
@ -178,7 +176,7 @@ func scrapeIndex(url string, out chan tabelogReview, wc *web.Cache, wg *sync.Wai
} }
} }
func scrapeReviews(url string, out chan tabelogReview, wc *web.Cache) error { func scrapeReviews(url string, out chan tabelogReview, wc *webCache) error {
var wg sync.WaitGroup var wg sync.WaitGroup
scrapeIndex(url, out, wc, &wg) scrapeIndex(url, out, wc, &wg)
wg.Wait() wg.Wait()
@ -188,12 +186,12 @@ func scrapeReviews(url string, out chan tabelogReview, wc *web.Cache) error {
} }
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error { func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error {
wc, err := web.NewCache(webCacheDir) wc, err := newWebCache(webCacheDir)
if err != nil { if err != nil {
return err return err
} }
gc, err := geo.NewCache(geoCacheFile) gc, err := newGeoCache(geoCacheFile)
if err != nil { if err != nil {
return err return err
} }
@ -205,5 +203,5 @@ func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error {
scrapeReviews(url, scrapeChan, wc) scrapeReviews(url, scrapeChan, wc)
dumpReviews(resultFile, decodeChan) dumpReviews(resultFile, decodeChan)
return gc.Save() return gc.save()
} }

View File

@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
package web package main
import ( import (
"bytes" "bytes"
@ -35,17 +35,17 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
type Cache struct { type webCache struct {
directory string directory string
ticker *time.Ticker ticker *time.Ticker
} }
func NewCache(directory string) (*Cache, error) { func newWebCache(directory string) (*webCache, error) {
if err := os.MkdirAll(directory, 0755); err != nil { if err := os.MkdirAll(directory, 0755); err != nil {
return nil, err return nil, err
} }
cache := &Cache{ cache := &webCache{
directory: directory, directory: directory,
ticker: time.NewTicker(time.Millisecond * 100), ticker: time.NewTicker(time.Millisecond * 100),
} }
@ -53,13 +53,13 @@ func NewCache(directory string) (*Cache, error) {
return cache, nil return cache, nil
} }
func (c *Cache) urlToLocal(url string) string { func (c *webCache) urlToLocal(url string) string {
hash := md5.New() hash := md5.New()
hash.Write([]byte(url)) hash.Write([]byte(url))
return path.Join(c.directory, fmt.Sprintf("%x.html", hash.Sum(nil))) return path.Join(c.directory, fmt.Sprintf("%x.html", hash.Sum(nil)))
} }
func (c *Cache) Load(url string) (*goquery.Document, error) { func (c *webCache) load(url string) (*goquery.Document, error) {
localPath := c.urlToLocal(url) localPath := c.urlToLocal(url)
if file, err := os.Open(localPath); err == nil { if file, err := os.Open(localPath); err == nil {