WIP
This commit is contained in:
parent
c92823220b
commit
b7a07980e5
@ -26,8 +26,6 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/kellydunn/golang-geo"
|
"github.com/kellydunn/golang-geo"
|
||||||
)
|
)
|
||||||
@ -40,7 +38,6 @@ type geoCoord struct {
|
|||||||
type geoCache struct {
|
type geoCache struct {
|
||||||
filename string
|
filename string
|
||||||
data map[string]geoCoord
|
data map[string]geoCoord
|
||||||
mutex sync.Mutex
|
|
||||||
coder geo.GoogleGeocoder
|
coder geo.GoogleGeocoder
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -89,11 +86,6 @@ func (c *geoCache) decode(address string) (geoCoord, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
coord := geoCoord{point.Lat(), point.Lng()}
|
coord := geoCoord{point.Lat(), point.Lng()}
|
||||||
|
|
||||||
c.mutex.Lock()
|
|
||||||
c.data[address] = coord
|
c.data[address] = coord
|
||||||
c.mutex.Unlock()
|
|
||||||
|
|
||||||
time.Sleep(200 * time.Millisecond)
|
|
||||||
return coord, nil
|
return coord, nil
|
||||||
}
|
}
|
||||||
|
69
tabelog.go
69
tabelog.go
@ -34,8 +34,6 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
const ()
|
|
||||||
|
|
||||||
type tabelogParams struct {
|
type tabelogParams struct {
|
||||||
Page int
|
Page int
|
||||||
}
|
}
|
||||||
@ -69,16 +67,13 @@ func makeAbsUrl(base, ref string) string {
|
|||||||
return b.ResolveReference(r).String()
|
return b.ResolveReference(r).String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func dumpReviews(filename string, rc chan tabelogReview, wg *sync.WaitGroup) {
|
func dumpReviews(filename string, in chan tabelogReview, wg *sync.WaitGroup) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
|
||||||
count := 1
|
|
||||||
var reviews []tabelogReview
|
var reviews []tabelogReview
|
||||||
for {
|
for {
|
||||||
if review, ok := <-rc; ok {
|
if review, ok := <-in; ok {
|
||||||
log.Printf("%s (%d)", review.Name, count)
|
|
||||||
reviews = append(reviews, review)
|
reviews = append(reviews, review)
|
||||||
count++
|
|
||||||
} else {
|
} else {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@ -94,7 +89,29 @@ func dumpReviews(filename string, rc chan tabelogReview, wg *sync.WaitGroup) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *webCache, gc *geoCache) {
|
func decodeReviews(in chan tabelogReview, out chan tabelogReview, wg *sync.WaitGroup, gc *geoCache) {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
|
for {
|
||||||
|
if review, ok := <-in; ok {
|
||||||
|
log.Print("decoding %s", review.Name)
|
||||||
|
|
||||||
|
coord, err := gc.decode(review.Address)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
review.Latitude = coord.Latitude
|
||||||
|
review.Longitude = coord.Longitude
|
||||||
|
|
||||||
|
out <- review
|
||||||
|
} else {
|
||||||
|
close(out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func scrapeReview(url string, out chan tabelogReview, wg *sync.WaitGroup, wc *webCache) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
|
||||||
doc, err := wc.load(url)
|
doc, err := wc.load(url)
|
||||||
@ -129,23 +146,10 @@ func scrapeReview(url string, rc chan tabelogReview, wg *sync.WaitGroup, wc *web
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
coord, err := gc.decode(review.Address)
|
out <- review
|
||||||
if err != nil {
|
|
||||||
switch err.Error() {
|
|
||||||
case "ZERO_RESULTS":
|
|
||||||
return
|
|
||||||
default:
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
review.Latitude = coord.Latitude
|
|
||||||
review.Longitude = coord.Longitude
|
|
||||||
|
|
||||||
rc <- review
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func scrapeIndex(url string, out chan tabelogReview, wc *webCache, gc *geoCache) {
|
func scrapeIndex(url string, out chan tabelogReview, wc *webCache) {
|
||||||
doc, err := wc.load(url)
|
doc, err := wc.load(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
@ -155,13 +159,13 @@ func scrapeIndex(url string, out chan tabelogReview, wc *webCache, gc *geoCache)
|
|||||||
doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
|
doc.Find("div.list-rst__header > p > a").Each(func(index int, sel *goquery.Selection) {
|
||||||
if href, ok := sel.Attr("href"); ok {
|
if href, ok := sel.Attr("href"); ok {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc, gc)
|
go scrapeReview(makeAbsUrl(url, href), out, &wg, wc)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
|
if href, ok := doc.Find("a.c-pagination__target--next").Attr("href"); ok {
|
||||||
scrapeIndex(makeAbsUrl(url, href), out, wc, gc)
|
scrapeIndex(makeAbsUrl(url, href), out, wc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -176,17 +180,20 @@ func scrapeTabelog(url, resultFile, webCacheDir, geoCacheFile string) {
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
scrapeChan := make(chan tabelogReview)
|
||||||
wg.Add(1)
|
decodeChan := make(chan tabelogReview)
|
||||||
rc := make(chan tabelogReview)
|
|
||||||
go dumpReviews(resultFile, rc, &wg)
|
|
||||||
|
|
||||||
scrapeIndex(url, rc, wc, gc)
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(2)
|
||||||
|
go decodeReviews(scrapeChan, decodeChan, &wg, gc)
|
||||||
|
go dumpReviews(resultFile, decodeChan, &wg)
|
||||||
|
|
||||||
|
scrapeIndex(url, scrapeChan, wc)
|
||||||
|
close(scrapeChan)
|
||||||
|
|
||||||
if err := gc.save(); err != nil {
|
if err := gc.save(); err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
close(rc)
|
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
14
webcache.go
14
webcache.go
@ -35,21 +35,21 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type webCache struct {
|
type webCache struct {
|
||||||
cacheDir string
|
directory string
|
||||||
}
|
}
|
||||||
|
|
||||||
func newWebCache(cacheDir string) (*webCache, error) {
|
func newWebCache(directory string) (*webCache, error) {
|
||||||
if err := os.MkdirAll(cacheDir, 0755); err != nil {
|
if err := os.MkdirAll(directory, 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return &webCache{cacheDir: cacheDir}, nil
|
return &webCache{directory: directory}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *webCache) urlToLocal(url string) string {
|
func (c *webCache) urlToLocal(url string) string {
|
||||||
hash := md5.New()
|
hash := md5.New()
|
||||||
hash.Write([]byte(url))
|
hash.Write([]byte(url))
|
||||||
return path.Join(c.cacheDir, fmt.Sprintf("%x.html", hash.Sum(nil)))
|
return path.Join(c.directory, fmt.Sprintf("%x.html", hash.Sum(nil)))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *webCache) load(url string) (*goquery.Document, error) {
|
func (c *webCache) load(url string) (*goquery.Document, error) {
|
||||||
@ -58,7 +58,8 @@ func (c *webCache) load(url string) (*goquery.Document, error) {
|
|||||||
if file, err := os.Open(localPath); err == nil {
|
if file, err := os.Open(localPath); err == nil {
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
return goquery.NewDocumentFromReader(file)
|
return goquery.NewDocumentFromReader(file)
|
||||||
} else {
|
}
|
||||||
|
|
||||||
res, err := http.Get(url)
|
res, err := http.Get(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -75,5 +76,4 @@ func (c *webCache) load(url string) (*goquery.Document, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return goquery.NewDocumentFromReader(&buff)
|
return goquery.NewDocumentFromReader(&buff)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user