Reorg
This commit is contained in:
parent
347ae73fe5
commit
6ae001c044
11598
data/tabelog.json
11598
data/tabelog.json
File diff suppressed because it is too large
Load Diff
@ -20,7 +20,7 @@
|
|||||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package geo
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
@ -31,22 +31,22 @@ import (
|
|||||||
"github.com/kellydunn/golang-geo"
|
"github.com/kellydunn/golang-geo"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Coord struct {
|
type geoPos struct {
|
||||||
Latitude float64
|
Latitude float64
|
||||||
Longitude float64
|
Longitude float64
|
||||||
}
|
}
|
||||||
|
|
||||||
type Cache struct {
|
type geoCache struct {
|
||||||
filename string
|
filename string
|
||||||
data map[string]Coord
|
data map[string]geoPos
|
||||||
ticker *time.Ticker
|
ticker *time.Ticker
|
||||||
coder geo.GoogleGeocoder
|
coder geo.GoogleGeocoder
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewCache(filename string) (*Cache, error) {
|
func newGeoCache(filename string) (*geoCache, error) {
|
||||||
cache := &Cache{
|
cache := &geoCache{
|
||||||
filename: filename,
|
filename: filename,
|
||||||
data: make(map[string]Coord),
|
data: make(map[string]geoPos),
|
||||||
ticker: time.NewTicker(time.Millisecond * 200),
|
ticker: time.NewTicker(time.Millisecond * 200),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,7 +57,7 @@ func NewCache(filename string) (*Cache, error) {
|
|||||||
return cache, nil
|
return cache, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) load() error {
|
func (c *geoCache) load() error {
|
||||||
file, err := os.Open(c.filename)
|
file, err := os.Open(c.filename)
|
||||||
if os.IsNotExist(err) {
|
if os.IsNotExist(err) {
|
||||||
return nil
|
return nil
|
||||||
@ -70,7 +70,7 @@ func (c *Cache) load() error {
|
|||||||
return json.NewDecoder(file).Decode(&c.data)
|
return json.NewDecoder(file).Decode(&c.data)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) Save() error {
|
func (c *geoCache) save() error {
|
||||||
js, err := json.MarshalIndent(c.data, "", " ")
|
js, err := json.MarshalIndent(c.data, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@ -79,19 +79,19 @@ func (c *Cache) Save() error {
|
|||||||
return ioutil.WriteFile(c.filename, js, 0644)
|
return ioutil.WriteFile(c.filename, js, 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) Decode(address string) (Coord, error) {
|
func (c *geoCache) decode(address string) (geoPos, error) {
|
||||||
if coord, ok := c.data[address]; ok {
|
if pos, ok := c.data[address]; ok {
|
||||||
return coord, nil
|
return pos, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
<-c.ticker.C
|
<-c.ticker.C
|
||||||
|
|
||||||
point, err := c.coder.Geocode(address)
|
point, err := c.coder.Geocode(address)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Coord{}, err
|
return geoPos{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
coord := Coord{point.Lat(), point.Lng()}
|
pos := geoPos{point.Lat(), point.Lng()}
|
||||||
c.data[address] = coord
|
c.data[address] = pos
|
||||||
return coord, nil
|
return pos, nil
|
||||||
}
|
}
|
26
tabelog.go
26
tabelog.go
@ -31,8 +31,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/FooSoft/scrape/geo"
|
|
||||||
"github.com/FooSoft/scrape/web"
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -91,13 +89,13 @@ func dumpReviews(filename string, in chan tabelogReview) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geo.Cache) {
|
func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geoCache) {
|
||||||
for {
|
for {
|
||||||
if review, ok := <-in; ok {
|
if review, ok := <-in; ok {
|
||||||
coord, err := gc.Decode(review.Address)
|
pos, err := gc.decode(review.Address)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
review.Latitude = coord.Latitude
|
review.Latitude = pos.Latitude
|
||||||
review.Longitude = coord.Longitude
|
review.Longitude = pos.Longitude
|
||||||
out <- review
|
out <- review
|
||||||
} else {
|
} else {
|
||||||
log.Printf("failed to decode address for %s (%v)", review.Url, err)
|
log.Printf("failed to decode address for %s (%v)", review.Url, err)
|
||||||
@ -109,10 +107,10 @@ func decodeReviews(in chan tabelogReview, out chan tabelogReview, gc *geo.Cache)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeReview(url string, out chan tabelogReview, wc *web.Cache, wg *sync.WaitGroup) {
|
func scrapeReview(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
|
||||||
doc, err := wc.Load(url)
|
doc, err := wc.load(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to scrape review at %s (%v)", url, err)
|
log.Printf("failed to scrape review at %s (%v)", url, err)
|
||||||
return
|
return
|
||||||
@ -148,8 +146,8 @@ func scrapeReview(url string, out chan tabelogReview, wc *web.Cache, wg *sync.Wa
|
|||||||
out <- review
|
out <- review
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeIndex(url string, out chan tabelogReview, wc *web.Cache, wg *sync.WaitGroup) {
|
func scrapeIndex(url string, out chan tabelogReview, wc *webCache, wg *sync.WaitGroup) {
|
||||||
doc, err := wc.Load(url)
|
doc, err := wc.load(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to scrape index at %s (%v)", url, err)
|
log.Printf("failed to scrape index at %s (%v)", url, err)
|
||||||
return
|
return
|
||||||
@ -178,7 +176,7 @@ func scrapeIndex(url string, out chan tabelogReview, wc *web.Cache, wg *sync.Wai
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeReviews(url string, out chan tabelogReview, wc *web.Cache) error {
|
func scrapeReviews(url string, out chan tabelogReview, wc *webCache) error {
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
scrapeIndex(url, out, wc, &wg)
|
scrapeIndex(url, out, wc, &wg)
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
@ -188,12 +186,12 @@ func scrapeReviews(url string, out chan tabelogReview, wc *web.Cache) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error {
|
func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error {
|
||||||
wc, err := web.NewCache(webCacheDir)
|
wc, err := newWebCache(webCacheDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
gc, err := geo.NewCache(geoCacheFile)
|
gc, err := newGeoCache(geoCacheFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -205,5 +203,5 @@ func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) error {
|
|||||||
scrapeReviews(url, scrapeChan, wc)
|
scrapeReviews(url, scrapeChan, wc)
|
||||||
dumpReviews(resultFile, decodeChan)
|
dumpReviews(resultFile, decodeChan)
|
||||||
|
|
||||||
return gc.Save()
|
return gc.save()
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package web
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
@ -35,17 +35,17 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Cache struct {
|
type webCache struct {
|
||||||
directory string
|
directory string
|
||||||
ticker *time.Ticker
|
ticker *time.Ticker
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewCache(directory string) (*Cache, error) {
|
func newWebCache(directory string) (*webCache, error) {
|
||||||
if err := os.MkdirAll(directory, 0755); err != nil {
|
if err := os.MkdirAll(directory, 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
cache := &Cache{
|
cache := &webCache{
|
||||||
directory: directory,
|
directory: directory,
|
||||||
ticker: time.NewTicker(time.Millisecond * 100),
|
ticker: time.NewTicker(time.Millisecond * 100),
|
||||||
}
|
}
|
||||||
@ -53,13 +53,13 @@ func NewCache(directory string) (*Cache, error) {
|
|||||||
return cache, nil
|
return cache, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) urlToLocal(url string) string {
|
func (c *webCache) urlToLocal(url string) string {
|
||||||
hash := md5.New()
|
hash := md5.New()
|
||||||
hash.Write([]byte(url))
|
hash.Write([]byte(url))
|
||||||
return path.Join(c.directory, fmt.Sprintf("%x.html", hash.Sum(nil)))
|
return path.Join(c.directory, fmt.Sprintf("%x.html", hash.Sum(nil)))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) Load(url string) (*goquery.Document, error) {
|
func (c *webCache) load(url string) (*goquery.Document, error) {
|
||||||
localPath := c.urlToLocal(url)
|
localPath := c.urlToLocal(url)
|
||||||
|
|
||||||
if file, err := os.Open(localPath); err == nil {
|
if file, err := os.Open(localPath); err == nil {
|
Loading…
Reference in New Issue
Block a user