1

Modifying data range

This commit is contained in:
Alex Yatskov 2015-08-24 17:03:00 +09:00
parent 90522c0f50
commit b5ee8380a0
4 changed files with 34 additions and 23 deletions

View File

@ -102,10 +102,6 @@ func computeStnData(restaurants []restaurant, stationsPath string) error {
return nil return nil
} }
// buildFeatures unpacks the four per-restaurant scores from r.features.
//
// The map keys "food", "service", "value" and "atmosphere" are returned as
// delicious, accommodating, affordable and atmospheric respectively. A key
// that is absent from the map yields 0, the zero value of a map lookup.
func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) {
	scores := r.features

	delicious = scores["food"]
	accommodating = scores["service"]
	affordable = scores["value"]
	atmospheric = scores["atmosphere"]

	return delicious, accommodating, affordable, atmospheric
}
func dumpData(dbPath string, restaraunts []restaurant) error { func dumpData(dbPath string, restaraunts []restaurant) error {
db, err := sql.Open("sqlite3", dbPath) db, err := sql.Open("sqlite3", dbPath)
if err != nil { if err != nil {
@ -135,8 +131,6 @@ func dumpData(dbPath string, restaraunts []restaurant) error {
} }
for _, r := range restaraunts { for _, r := range restaraunts {
delicious, accommodating, affordable, atmospheric := buildFeatures(r)
_, err = db.Exec(` _, err = db.Exec(`
INSERT INTO reviews( INSERT INTO reviews(
name, name,
@ -153,10 +147,10 @@ func dumpData(dbPath string, restaraunts []restaurant) error {
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
r.name, r.name,
r.url, r.url,
delicious, r.feats.delicious,
accommodating, r.feats.accommodating,
affordable, r.feats.affordable,
atmospheric, r.feats.atmospheric,
r.longitude, r.longitude,
r.latitude, r.latitude,
r.closestStnDist, r.closestStnDist,

View File

@ -30,12 +30,19 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
// features holds the four scores recorded for a restaurant, in declaration
// order: delicious, accommodating, affordable, atmospheric.
//
// The review scrapers shown alongside this type fill each field as
// rating/2.5 - 1.0.
// NOTE(review): presumably the site ratings are on a 0-5 scale, which would
// put each field in [-1, 1]; confirm against the scraped pages.
type features struct {
	delicious, accommodating, affordable, atmospheric float64
}
type restaurant struct { type restaurant struct {
name string name string
address string address string
url string url string
features map[string]float64 feats features
latitude float64 latitude float64
longitude float64 longitude float64
@ -46,7 +53,7 @@ type restaurant struct {
type scraper interface { type scraper interface {
index(doc *goquery.Document) (string, []string) index(doc *goquery.Document) (string, []string)
review(doc *goquery.Document) (string, string, map[string]float64, error) review(doc *goquery.Document) (string, string, features, error)
} }
func makeAbsUrl(ref, base string) (string, error) { func makeAbsUrl(ref, base string) (string, error) {
@ -90,17 +97,17 @@ func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.Wai
return return
} }
name, address, features, err := scr.review(doc) name, address, feats, err := scr.review(doc)
if err != nil { if err != nil {
log.Printf("failed to scrape review at %s (%v)", url, err) log.Printf("failed to scrape review at %s (%v)", url, err)
return return
} }
out <- restaurant{ out <- restaurant{
name: name, name: name,
address: address, address: address,
features: features, feats: feats,
url: url} url: url}
} }
func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) { func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {

View File

@ -50,7 +50,7 @@ func (tabelog) index(doc *goquery.Document) (string, []string) {
return nextIndexUrl, reviewUrls return nextIndexUrl, reviewUrls
} }
func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]float64, err error) { func (tabelog) review(doc *goquery.Document) (name, address string, feat features, err error) {
name = doc.Find("a.rd-header__rst-name-main").Text() name = doc.Find("a.rd-header__rst-name-main").Text()
if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 { if addresses := doc.Find("p.rd-detail-info__rst-address"); addresses.Length() == 2 {
@ -60,14 +60,19 @@ func (tabelog) review(doc *goquery.Document) (name, address string, features map
return return
} }
features = make(map[string]float64) f := make(map[string]float64)
for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} { for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} {
text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text() text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
if features[category], err = strconv.ParseFloat(text, 8); err != nil { if f[category], err = strconv.ParseFloat(text, 8); err != nil {
err = fmt.Errorf("invalid value for %s", category) err = fmt.Errorf("invalid value for %s", category)
return return
} }
} }
feat.accommodating = f["service"]/2.5 - 1.0
feat.affordable = f["cost"]/2.5 - 1.0
feat.atmospheric = f["atmosphere"]/2.5 - 1.0
feat.delicious = f["dishes"]/2.5 - 1.0
return return
} }

View File

@ -50,7 +50,7 @@ func (tripadvisor) index(doc *goquery.Document) (string, []string) {
return nextIndexUrl, reviewUrls return nextIndexUrl, reviewUrls
} }
func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]float64, err error) { func (tripadvisor) review(doc *goquery.Document) (name, address string, feat features, err error) {
name = strings.TrimSpace(doc.Find("h1#HEADING").Text()) name = strings.TrimSpace(doc.Find("h1#HEADING").Text())
address = strings.TrimSpace(doc.Find("address span.format_address").Text()) address = strings.TrimSpace(doc.Find("address span.format_address").Text())
@ -60,15 +60,20 @@ func (tripadvisor) review(doc *goquery.Document) (name, address string, features
return return
} }
features = make(map[string]float64) f := make(map[string]float64)
for index, category := range []string{"food", "service", "value", "atmosphere"} { for index, category := range []string{"food", "service", "value", "atmosphere"} {
alt, _ := ratings.Eq(index).Attr("alt") alt, _ := ratings.Eq(index).Attr("alt")
rating := strings.Split(alt, " ")[0] rating := strings.Split(alt, " ")[0]
if features[category], err = strconv.ParseFloat(rating, 8); err != nil { if f[category], err = strconv.ParseFloat(rating, 8); err != nil {
err = fmt.Errorf("invalid value for %s", category) err = fmt.Errorf("invalid value for %s", category)
return return
} }
} }
feat.accommodating = f["service"]/2.5 - 1.0
feat.affordable = f["value"]/2.5 - 1.0
feat.atmospheric = f["atmosphere"]/2.5 - 1.0
feat.delicious = f["food"]/2.5 - 1.0
return return
} }