From c4791a87e93026c6b08dec84b0226c98a8f429d7 Mon Sep 17 00:00:00 2001
From: Alex Yatskov
Date: Sun, 20 Sep 2015 20:02:03 +0900
Subject: [PATCH] WIP

---
 build/build.go              |  31 +----
 build/data/tripadvisor.toml |  54 ++++++---
 build/descriptor.go         | 265 ++++++++++++++++++++++++++++++++++++
 build/tabelog.go            |  10 +-
 build/tripadvisor.go        |   8 +-
 5 files changed, 314 insertions(+), 54 deletions(-)
 create mode 100644 build/descriptor.go

diff --git a/build/build.go b/build/build.go
index 3926a0a..b12965d 100644
--- a/build/build.go
+++ b/build/build.go
@@ -52,29 +52,6 @@ func (s scrapeCtx) load(url string) (*goquery.Document, error) {
 	return s.wc.load(url)
 }
 
-type semantics struct {
-	accomodating float64
-	affordable   float64
-	atmospheric  float64
-	delicious    float64
-}
-
-func (s semantics) combine(other semantics, weight float64) semantics {
-	return semantics{
-		s.accomodating + other.accomodating*weight,
-		s.affordable + other.affordable*weight,
-		s.atmospheric + other.atmospheric*weight,
-		s.delicious + other.delicious*weight}
-}
-
-func (s semantics) reduce(weight float64) semantics {
-	return semantics{
-		s.accomodating / weight,
-		s.affordable / weight,
-		s.atmospheric / weight,
-		s.delicious / weight}
-}
-
 type restaurant struct {
 	name    string
 	address string
@@ -251,10 +228,10 @@ func dumpData(dbPath string, restaraunts map[uint64]*restaurant) error {
 		) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
 			rest.name,
 			rest.address,
-			rest.sem.delicious,
-			rest.sem.accomodating,
-			rest.sem.affordable,
-			rest.sem.atmospheric,
+			rest.sem.Delicious,
+			rest.sem.Accomodating,
+			rest.sem.Affordable,
+			rest.sem.Atmospheric,
 			rest.latitude,
 			rest.longitude,
 			rest.closestStnDist,
diff --git a/build/data/tripadvisor.toml b/build/data/tripadvisor.toml
index 28cc2d9..236c965 100644
--- a/build/data/tripadvisor.toml
+++ b/build/data/tripadvisor.toml
@@ -1,46 +1,64 @@
-[index]
-    [index.item]
+name = "tripadvisor"
+
+[index.items]
         path = "a.property_title"
         attr = "href"
 
-    [index.next]
+[index.next]
         path = "div.deckTools.btm a.nav.next.rndBtn.rndBtnGreen.taLnk"
         attr = "href"
 
-[item]
-    scale = 5.0
-
-    [item.name]
+[item.name]
         path = "h1#HEADING"
 
-    [item.address]
+[item.address]
         path = "address span.format_address > span:not(.extended-address)"
 
-    [item.weight]
+[item.weight]
         path = "h3.reviews_header"
         regex = "^(\d+)"
 
-    [item.semantics]
-        [item.semantics.service]
+[item.semantics]
+    [item.semantics.service]
+        accomodating = 1.0
+        affordable = 0.0
+        atmospheric = 0.0
+        delicious = 0.0
+        scale = 5.0
+
             path = "ul.barChart div.ratingRow img:nth-child(2)"
             regex = "^([0-9]*\.?[0-9]+)"
             attr = "alt"
-            def = {"accomodating": 1.0, "affordable": 0.0, "atmospheric": 0.0, "delicious": 0.0}
 
-        [item.semantics.food]
+    [item.semantics.food]
+        accomodating = 0.0
+        affordable = 0.0
+        atmospheric = 0.0
+        delicious = 1.0
+        scale = 5.0
+
             path = "ul.barChart div.ratingRow img:nth-child(1)"
             regex = "^([0-9]*\.?[0-9]+)"
             attr = "alt"
-            def = {"accomodating": 0.0, "affordable": 0.0, "atmospheric": 0.0, "delicious": 1.0}
 
-        [item.semantics.value]
+    [item.semantics.value]
+        accomodating = 0.0
+        affordable = 1.0
+        atmospheric = 0.0
+        delicious = 0.0
+        scale = 5.0
+
             path = "ul.barChart div.ratingRow img:nth-child(3)"
             regex = "^([0-9]*\.?[0-9]+)"
             attr = "alt"
-            def = {"accomodating": 0.0, "affordable": 1.0, "atmospheric": 0.0, "delicious": 0.0}
 
-        [item.semantics.atmosphere]
+    [item.semantics.atmosphere]
+        accomodating = 0.0
+        affordable = 0.0
+        atmospheric = 1.0
+        delicious = 0.0
+        scale = 5.0
+
             path = "ul.barChart div.ratingRow img:nth-child(4)"
             regex = "^([0-9]*\.?[0-9]+)"
             attr = "alt"
-            def = {"accomodating": 0.0, "affordable": 0.0, "atmospheric": 1.0, "delicious": 0.0}
diff --git a/build/descriptor.go b/build/descriptor.go
new file mode 100644
index 0000000..219b7ac
--- /dev/null
+++ b/build/descriptor.go
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2015 Alex Yatskov
+ * Author: Alex Yatskov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package main
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/naoina/toml"
+)
+
+//
+// descriptor bundles the caches passed to newDescriptor with the scraping
+// configuration decoded from a TOML file.
+//
+type descriptor struct {
+	gc *geoCache
+	wc *webCache
+
+	conf descConf
+}
+
+//
+// semantics is a four-component score vector. Its fields are exported,
+// presumably so the TOML decoder can fill them in through semConf.
+//
+type semantics struct {
+	Accomodating float64
+	Affordable   float64
+	Atmospheric  float64
+	Delicious    float64
+}
+
+// combine returns s plus other scaled by weight, component by component.
+func (s semantics) combine(other semantics, weight float64) semantics {
+	return semantics{
+		s.Accomodating + other.Accomodating*weight,
+		s.Affordable + other.Affordable*weight,
+		s.Atmospheric + other.Atmospheric*weight,
+		s.Delicious + other.Delicious*weight,
+	}
+}
+
+// reduce returns s with every component divided by weight.
+func (s semantics) reduce(weight float64) semantics {
+	return semantics{
+		s.Accomodating / weight,
+		s.Affordable / weight,
+		s.Atmospheric / weight,
+		s.Delicious / weight,
+	}
+}
+
+//
+// pathConf describes how to pull text out of a document: Path is the
+// selector, Attr optionally names an attribute to read instead of the element
+// text, and RegEx optionally narrows each string to its first capture group.
+//
+type pathConf struct {
+	Path  string
+	Attr  string
+	RegEx string
+
+	regExComp *regexp.Regexp // compiled RegEx, built on first use
+}
+
+// locateStrings returns one string per element matching Path, read from Attr
+// when it is set and from the element text otherwise. With RegEx set, each
+// string becomes its first capture group, or "" when the pattern does not
+// match. The only error returned is a RegEx compilation failure.
+func (c *pathConf) locateStrings(doc *goquery.Document) ([]string, error) {
+	var err error
+	if len(c.RegEx) > 0 && c.regExComp == nil {
+		if c.regExComp, err = regexp.Compile(c.RegEx); err != nil {
+			return nil, err
+		}
+	}
+
+	var strs []string
+	doc.Find(c.Path).Each(func(index int, sel *goquery.Selection) {
+		var str string
+		if len(c.Attr) > 0 {
+			// The exists flag is dropped: a missing attribute yields "".
+			str, _ = sel.Attr(c.Attr)
+		} else {
+			str = sel.Text()
+		}
+
+		if c.regExComp != nil {
+			if matches := c.regExComp.FindStringSubmatch(str); len(matches) > 1 {
+				str = matches[1]
+			} else {
+				str = ""
+			}
+		}
+
+		strs = append(strs, str)
+	})
+
+	return strs, nil
+}
+
+// locateString joins the results of locateStrings with single spaces.
+func (c *pathConf) locateString(doc *goquery.Document) (string, error) {
+	strs, err := c.locateStrings(doc)
+	if err != nil {
+		return "", err
+	}
+
+	return strings.Join(strs, " "), nil
+}
+
+// locateInt parses the located string as a base-10 integer.
+func (c *pathConf) locateInt(doc *goquery.Document) (int64, error) {
+	str, err := c.locateString(doc)
+	if err != nil {
+		return 0, err
+	}
+
+	// bitSize is 64 to match the int64 result; a smaller size makes ParseInt
+	// reject anything outside that narrower range.
+	return strconv.ParseInt(str, 10, 64)
+}
+
+// locateFloat parses the located string as a 64-bit float.
+func (c *pathConf) locateFloat(doc *goquery.Document) (float64, error) {
+	str, err := c.locateString(doc)
+	if err != nil {
+		return 0.0, err
+	}
+
+	return strconv.ParseFloat(str, 64)
+}
+
+//
+// semConf describes one scraped rating: where to find it (pathConf), the
+// Scale it is divided by, and the semantics vector it stands for.
+//
+type semConf struct {
+	semantics
+	pathConf
+
+	Scale float64
+}
+
+//
+// descConf is the decoded form of a descriptor TOML file.
+//
+// NOTE(review): tripadvisor.toml names its tables [item.weight] and
+// [item.semantics] while the fields here are Count and Sem; confirm the
+// decoder maps one onto the other.
+//
+type descConf struct {
+	Index struct {
+		Items pathConf
+		Next  pathConf
+	}
+
+	Item struct {
+		Name    pathConf
+		Address pathConf
+		Count   pathConf
+		Sem     map[string]semConf
+	}
+}
+
+// newDescriptor reads filename, decodes it as TOML into the descriptor's
+// configuration, and returns a descriptor holding gc and wc.
+func newDescriptor(filename string, gc *geoCache, wc *webCache) (*descriptor, error) {
+	desc := &descriptor{gc: gc, wc: wc}
+
+	bytes, err := ioutil.ReadFile(filename)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := toml.Unmarshal(bytes, &desc.conf); err != nil {
+		return nil, err
+	}
+
+	return desc, nil
+}
+
+// define returns the semantics configured for keyword, or the zero value when
+// keyword has no entry.
+func (d descriptor) define(keyword string) semantics {
+	return d.conf.Item.Sem[keyword].semantics
+}
+
+// index extracts the item strings and the next-page string from doc.
+func (d descriptor) index(doc *goquery.Document) (next string, items []string, err error) {
+	if items, err = d.conf.Index.Items.locateStrings(doc); err != nil {
+		return
+	}
+
+	if next, err = d.conf.Index.Next.locateString(doc); err != nil {
+		return
+	}
+
+	return
+}
+
+// review extracts the name, address, review count, and per-feature values
+// from doc. A feature value is divided by its Scale when Scale is non-zero.
+// Failures are reported as "invalid ..." errors that replace the cause.
+func (d descriptor) review(doc *goquery.Document) (name, address string, features map[string]float64, count int64, err error) {
+	if name, err = d.conf.Item.Name.locateString(doc); err != nil || len(name) == 0 {
+		err = errors.New("invalid name")
+		return
+	}
+
+	if address, err = d.conf.Item.Address.locateString(doc); err != nil || len(address) == 0 {
+		err = errors.New("invalid address")
+		return
+	}
+
+	if count, err = d.conf.Item.Count.locateInt(doc); err != nil {
+		err = errors.New("invalid review count")
+		return
+	}
+
+	features = make(map[string]float64)
+	for n, s := range d.conf.Item.Sem {
+		var value float64
+		// NOTE(review): s is a copy of the map entry, so a regex compiled
+		// inside locateFloat is not kept for the next call.
+		if value, err = s.pathConf.locateFloat(doc); err != nil {
+			err = fmt.Errorf("invalid feature value for %s", n)
+			return
+		}
+
+		if s.Scale != 0.0 {
+			value /= s.Scale
+		}
+
+		features[n] = value
+	}
+
+	return
+}
diff --git a/build/tabelog.go b/build/tabelog.go
index f801074..988967f 100644
--- a/build/tabelog.go
+++ b/build/tabelog.go
@@ -37,11 +37,11 @@ type tabelog struct {
 
 func (tabelog) define(keyword string) semantics {
 	return map[string]semantics{
-		"dishes":     {accomodating: 0.0, affordable: 0.0, atmospheric: 0.0, delicious: 0.8},
-		"drinks":     {accomodating: 0.0, affordable: 0.0, atmospheric: 0.0, delicious: 0.2},
-		"service":    {accomodating: 1.0, affordable: 0.0, atmospheric: 0.0, delicious: 0.0},
-		"cost":       {accomodating: 0.0, affordable: 1.0, atmospheric: 0.0, delicious: 0.0},
-		"atmosphere": {accomodating: 0.0, affordable: 0.0, atmospheric: 1.0, delicious: 0.0},
+		"dishes":     {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 0.8},
+		"drinks":     {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 0.2},
+		"service":    {Accomodating: 1.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 0.0},
+		"cost":       {Accomodating: 0.0, Affordable: 1.0, Atmospheric: 0.0, Delicious: 0.0},
+		"atmosphere": {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 1.0, Delicious: 0.0},
 	}[keyword]
 }
 
diff --git a/build/tripadvisor.go b/build/tripadvisor.go
index c5d41c2..69f3e36 100644
--- a/build/tripadvisor.go
+++ b/build/tripadvisor.go
@@ -37,10 +37,10 @@ type tripadvisor struct {
 
 func (tripadvisor) define(keyword string) semantics {
 	return map[string]semantics{
-		"food":       {accomodating: 0.0, affordable: 0.0, atmospheric: 0.0, delicious: 1.0},
-		"service":    {accomodating: 1.0, affordable: 0.0, atmospheric: 0.0, delicious: 0.0},
-		"value":      {accomodating: 0.0, affordable: 1.0, atmospheric: 0.0, delicious: 0.0},
-		"atmosphere": {accomodating: 0.0, affordable: 0.0, atmospheric: 1.0, delicious: 0.0},
+		"food":       {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 1.0},
+		"service":    {Accomodating: 1.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 0.0},
+		"value":      {Accomodating: 0.0, Affordable: 1.0, Atmospheric: 0.0, Delicious: 0.0},
+		"atmosphere": {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 1.0, Delicious: 0.0},
 	}[keyword]
 }