From 4cce5231032c81010054a1c14f340f46b9961650 Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Tue, 22 Sep 2015 11:06:20 +0900 Subject: [PATCH] Updates --- build/build.go | 307 +++++++++++-------------------------------- build/converter.go | 42 ++++-- build/db.go | 201 ++++++++++++++++++++++++++++ build/scrape.go | 32 +++-- build/tabelog.go | 1 - build/tripadvisor.go | 1 - 6 files changed, 322 insertions(+), 262 deletions(-) create mode 100644 build/db.go diff --git a/build/build.go b/build/build.go index b12965d..1b41469 100644 --- a/build/build.go +++ b/build/build.go @@ -24,34 +24,14 @@ package main import ( "bufio" - "bytes" - "database/sql" - "encoding/binary" - "errors" "flag" - "hash/fnv" "log" - "net/url" "os" + "path/filepath" - "github.com/PuerkitoBio/goquery" - "github.com/fatih/color" _ "github.com/mattn/go-sqlite3" ) -type scrapeCtx struct { - gc *geoCache - wc *webCache -} - -func (s scrapeCtx) decode(address string) (float64, float64, error) { - return s.gc.decode(address) -} - -func (s scrapeCtx) load(url string) (*goquery.Document, error) { - return s.wc.load(url) -} - type restaurant struct { name string address string @@ -65,254 +45,119 @@ type restaurant struct { closestStnDist float64 } -func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) { - gc, err := newGeoCache(geocachePath) - if err != nil { - return nil, err - } - defer gc.save() - - wc, err := newWebCache(webcachePath) +func loadConverters(directory string) ([]*converter, error) { + matches, err := filepath.Glob(filepath.Join(directory, "*.toml")) if err != nil { return nil, err } - file, err := os.Open(urlsPath) + var convs []*converter + for _, match := range matches { + conv, err := newConverter(match) + if err != nil { + return nil, err + } + + convs = append(convs, conv) + } + + return convs, nil +} + +func loadUrls(filename string) ([]string, error) { + file, err := os.Open(filename) if err != nil { return nil, err } defer file.Close() - ctx := scrapeCtx{gc, wc} - tlog := tabelog{scrapeCtx: ctx} - tadv := tripadvisor{scrapeCtx: ctx} - - var reviews []review + var urls []string for scanner := bufio.NewScanner(file); scanner.Scan(); { - if line := scanner.Text(); len(line) > 0 { - parsed, err := url.Parse(line) - if err != nil { - return nil, err - } - - var revs []review - switch parsed.Host { - case "tabelog.com": - revs, err = scrape(line, tlog) - case "www.tripadvisor.com": - revs, err = scrape(line, tadv) - default: - err = errors.New("unsupported review site") + if url := scanner.Text(); len(url) > 0 { + urls = append(urls, url) + } + } + + return urls, nil +} + +func scrapeReviews(urls []string, converters []*converter, gc *geoCache, wc *webCache) ([]review, error) { + var reviews []review + + for _, u := range urls { + for _, c := range converters { + if !c.compatible(u) { + continue } + revs, err := scrape(u, c, gc, wc) if err != nil { return nil, err } reviews = append(reviews, revs...) 
+ break } } return reviews, nil } -func collateData(reviews []review) map[uint64]*restaurant { - restaurants := make(map[uint64]*restaurant) - - for _, rev := range reviews { - var buff bytes.Buffer - binary.Write(&buff, binary.LittleEndian, rev.latitude) - binary.Write(&buff, binary.LittleEndian, rev.longitude) - binary.Write(&buff, binary.LittleEndian, rev.name) - - hash := fnv.New64() - hash.Write(buff.Bytes()) - - var rest *restaurant - if rest, _ = restaurants[hash.Sum64()]; rest == nil { - rest = &restaurant{name: rev.name, address: rev.address, latitude: rev.latitude, longitude: rev.longitude} - restaurants[hash.Sum64()] = rest - } - - rest.reviews = append(rest.reviews, rev) - } - - return restaurants -} - -func computeStations(restaurants map[uint64]*restaurant, stationsPath string) error { - sq, err := newStationQuery(stationsPath) - if err != nil { - return err - } - - for _, rest := range restaurants { - rest.closestStnName, rest.closestStnDist = sq.closestStation(rest.latitude, rest.longitude) - } - - return nil -} - -func computeSemantics(restaraunts map[uint64]*restaurant) { - type definer interface { - define(keyword string) semantics - } - - for _, rest := range restaraunts { - var ( - sem semantics - weight float64 - ) - - for _, rev := range rest.reviews { - def, ok := rev.scr.(definer) - if !ok { - continue - } - - for name, value := range rev.features { - sem = sem.combine(def.define(name), rev.weight*value) - } - - weight += rev.weight - } - - if weight > 0.0 { - rest.sem = sem.reduce(weight) - } - } -} - -func dumpData(dbPath string, restaraunts map[uint64]*restaurant) error { - db, err := sql.Open("sqlite3", dbPath) - if err != nil { - return err - } - defer db.Close() - - _, err = db.Exec(` - DROP TABLE IF EXISTS reviews; - CREATE TABLE reviews( - name VARCHAR(100) NOT NULL, - address VARCHAR(400) NOT NULL, - delicious FLOAT NOT NULL, - accommodating FLOAT NOT NULL, - affordable FLOAT NOT NULL, - atmospheric FLOAT NOT NULL, - latitude FLOAT NOT NULL, - longitude FLOAT NOT NULL, - closestStnDist FLOAT NOT NULL, - closestStnName VARCHAR(100) NOT NULL, - accessCount INTEGER NOT NULL, - id INTEGER PRIMARY KEY - )`) - - if err != nil { - return err - } - - for _, rest := range restaraunts { - _, err = db.Exec(` - INSERT INTO reviews( - name, - address, - delicious, - accommodating, - affordable, - atmospheric, - latitude, - longitude, - closestStnDist, - closestStnName, - accessCount - ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - rest.name, - rest.address, - rest.sem.Delicious, - rest.sem.Accomodating, - rest.sem.Affordable, - rest.sem.Atmospheric, - rest.latitude, - rest.longitude, - rest.closestStnDist, - rest.closestStnName, - 0) - - if err != nil { - return err - } - } - - _, err = db.Exec(` - DROP TABLE IF EXISTS categories; - CREATE TABLE categories( - description VARCHAR(200) NOT NULL, - id INTEGER PRIMARY KEY)`) - - if err != nil { - return err - } - - for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} { - if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil { - return err - } - } - - _, err = db.Exec(` - DROP TABLE IF EXISTS history; - CREATE TABLE history( - date DATETIME NOT NULL, - reviewId INTEGER NOT NULL, - id INTEGER PRIMARY KEY, - FOREIGN KEY(reviewId) REFERENCES reviews(id))`) - - if err != nil { - return err - } - - _, err = db.Exec(` - DROP TABLE IF EXISTS historyGroups; - CREATE TABLE historyGroups( - categoryId INTEGER NOT NULL, - categoryValue FLOAT NOT NULL, 
- historyId INTEGER NOT NULL, - FOREIGN KEY(historyId) REFERENCES history(id), - FOREIGN KEY(categoryId) REFERENCES categories(id))`) - - if err != nil { - return err - } - - return nil -} - func main() { - dbPath := flag.String("db", "data/db.sqlite3", "database output path") - urlsPath := flag.String("urls", "data/urls.txt", "index URLs to scrape") - stationsPath := flag.String("stations", "data/stations.json", "station geolocation data") - geocachePath := flag.String("geocache", "cache/geocache.json", "geolocation data cache") - webcachePath := flag.String("webcache", "cache/webcache", "web data cache") + var ( + dbPath = flag.String("db", "data/db.sqlite3", "database output path") + urlsPath = flag.String("urls", "data/urls.txt", "index URLs to scrape") + convertersPath = flag.String("converters", "data/converters", "directory for converters") + stationsPath = flag.String("stations", "data/stations.json", "station geolocation data") + geocachePath = flag.String("geocache", "cache/geocache.json", "geolocation data cache") + webcachePath = flag.String("webcache", "cache/webcache", "web data cache") + ) + flag.Parse() - log.Print(color.BlueString("scraping data...")) - reviews, err := scrapeData(*urlsPath, *geocachePath, *webcachePath) + log.Printf("loading geocache %s...", *geocachePath) + gc, err := newGeoCache(*geocachePath) + if err != nil { + log.Fatal(err) + } + defer gc.save() + + log.Printf("loading webcache %s...", *webcachePath) + wc, err := newWebCache(*webcachePath) if err != nil { log.Fatal(err) } - log.Print(color.BlueString("collating data...")) + log.Printf("loading urls %s...", *urlsPath) + urls, err := loadUrls(*urlsPath) + if err != nil { + log.Fatal(err) + } + + log.Printf("loading converters %s...", *convertersPath) + converters, err := loadConverters(*convertersPath) + if err != nil { + log.Fatal(err) + } + + log.Print("scraping reviews...") + reviews, err := scrapeReviews(urls, converters, gc, wc) + if err != nil { + log.Fatal(err) + } + + log.Print("collating data...") restaurants := collateData(reviews) - log.Print(color.BlueString("computing data semantics..")) + log.Print("computing data semantics..") computeSemantics(restaurants) - log.Print(color.BlueString("computing station data...")) + log.Print("computing station data...") if err := computeStations(restaurants, *stationsPath); err != nil { log.Fatal(err) } - log.Print(color.BlueString("saving data...")) + log.Print("saving data...") if err := dumpData(*dbPath, restaurants); err != nil { log.Fatal(err) } diff --git a/build/converter.go b/build/converter.go index edeb505..7ae7310 100644 --- a/build/converter.go +++ b/build/converter.go @@ -26,6 +26,7 @@ import ( "errors" "fmt" "io/ioutil" + "net/url" "regexp" "strconv" "strings" @@ -63,9 +64,9 @@ func (s semantics) reduce(weight float64) semantics { } // -// locator +// selector // -type locator struct { +type selector struct { Path string Attr string RegEx string @@ -73,7 +74,7 @@ type locator struct { regExComp *regexp.Regexp } -func (l *locator) locateStrings(doc *goquery.Document) ([]string, error) { +func (l *selector) locateStrings(doc *goquery.Document) ([]string, error) { var err error if len(l.RegEx) > 0 && l.regExComp == nil { if l.regExComp, err = regexp.Compile(l.RegEx); err != nil { @@ -105,7 +106,7 @@ func (l *locator) locateStrings(doc *goquery.Document) ([]string, error) { } -func (l *locator) locateString(doc *goquery.Document) (string, error) { +func (l *selector) locateString(doc *goquery.Document) (string, error) { strs, err := 
l.locateStrings(doc) if err != nil { return "", err @@ -114,7 +115,7 @@ func (l *locator) locateString(doc *goquery.Document) (string, error) { return strings.Join(strs, " "), nil } -func (l *locator) locateInt(doc *goquery.Document) (int64, error) { +func (l *selector) locateInt(doc *goquery.Document) (int64, error) { str, err := l.locateString(doc) if err != nil { return 0, err @@ -123,7 +124,7 @@ func (l *locator) locateInt(doc *goquery.Document) (int64, error) { return strconv.ParseInt(str, 10, 8) } -func (l *locator) locateFloat(doc *goquery.Document) (float64, error) { +func (l *selector) locateFloat(doc *goquery.Document) (float64, error) { str, err := l.locateString(doc) if err != nil { return 0.0, err @@ -136,18 +137,20 @@ func (l *locator) locateFloat(doc *goquery.Document) (float64, error) { // converter // type converter struct { + Domains []string + Index struct { - Items locator - Next locator + Items selector + Next selector } Item struct { - Name locator - Address locator - Count locator + Name selector + Address selector + Count selector Props map[string]struct { semantics - locator + selector Scale float64 } } @@ -171,6 +174,21 @@ func (c converter) define(keyword string) semantics { return c.Item.Props[keyword].semantics } +func (c converter) compatible(address string) bool { + parsed, err := url.Parse(address) + if err != nil { + return false + } + + for _, d := range c.Domains { + if d == parsed.Host { + return true + } + } + + return false +} + func (c converter) index(doc *goquery.Document) (next string, items []string, err error) { if items, err = c.Index.Items.locateStrings(doc); err != nil { return diff --git a/build/db.go b/build/db.go new file mode 100644 index 0000000..1f309ec --- /dev/null +++ b/build/db.go @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2015 Alex Yatskov + * Author: Alex Yatskov + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package main + +import ( + "bytes" + "database/sql" + "encoding/binary" + "hash/fnv" +) + +func collateData(reviews []review) map[uint64]*restaurant { + restaurants := make(map[uint64]*restaurant) + + for _, rev := range reviews { + var buff bytes.Buffer + binary.Write(&buff, binary.LittleEndian, rev.latitude) + binary.Write(&buff, binary.LittleEndian, rev.longitude) + binary.Write(&buff, binary.LittleEndian, rev.name) + + hash := fnv.New64() + hash.Write(buff.Bytes()) + + var rest *restaurant + if rest, _ = restaurants[hash.Sum64()]; rest == nil { + rest = &restaurant{name: rev.name, address: rev.address, latitude: rev.latitude, longitude: rev.longitude} + restaurants[hash.Sum64()] = rest + } + + rest.reviews = append(rest.reviews, rev) + } + + return restaurants +} + +func computeSemantics(restaraunts map[uint64]*restaurant) { + type definer interface { + define(keyword string) semantics + } + + for _, rest := range restaraunts { + var ( + sem semantics + count int64 + ) + + for _, rev := range rest.reviews { + def, ok := rev.scr.(definer) + if !ok { + continue + } + + for name, value := range rev.features { + sem = sem.combine(def.define(name), float64(rev.count)*value) + } + + count += rev.count + } + + if count > 0 { + rest.sem = sem.reduce(float64(count)) + } + } +} + +func computeStations(restaurants map[uint64]*restaurant, stationsPath string) error { + sq, err := newStationQuery(stationsPath) + if err != nil { + return err + } + + for _, rest := range restaurants { + rest.closestStnName, rest.closestStnDist = sq.closestStation(rest.latitude, rest.longitude) + } + + return nil +} + +func dumpData(dbPath string, restaraunts map[uint64]*restaurant) error { + db, err := sql.Open("sqlite3", dbPath) + if err != nil { + return err + } + defer db.Close() + + _, err = db.Exec(` + DROP TABLE IF EXISTS reviews; + CREATE TABLE reviews( + name VARCHAR(100) NOT NULL, + address VARCHAR(400) NOT NULL, + delicious FLOAT NOT NULL, + accommodating FLOAT NOT NULL, + affordable FLOAT NOT NULL, + atmospheric FLOAT NOT NULL, + latitude FLOAT NOT NULL, + longitude FLOAT NOT NULL, + closestStnDist FLOAT NOT NULL, + closestStnName VARCHAR(100) NOT NULL, + accessCount INTEGER NOT NULL, + id INTEGER PRIMARY KEY + )`) + + if err != nil { + return err + } + + for _, rest := range restaraunts { + _, err = db.Exec(` + INSERT INTO reviews( + name, + address, + delicious, + accommodating, + affordable, + atmospheric, + latitude, + longitude, + closestStnDist, + closestStnName, + accessCount + ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + rest.name, + rest.address, + rest.sem.Delicious, + rest.sem.Accomodating, + rest.sem.Affordable, + rest.sem.Atmospheric, + rest.latitude, + rest.longitude, + rest.closestStnDist, + rest.closestStnName, + 0) + + if err != nil { + return err + } + } + + _, err = db.Exec(` + DROP TABLE IF EXISTS categories; + CREATE TABLE categories( + description VARCHAR(200) NOT NULL, + id INTEGER PRIMARY KEY)`) + + if err != nil { + return err + } + + for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} { + if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil { + return err + } + } + + _, err = db.Exec(` + DROP TABLE IF EXISTS history; + CREATE TABLE history( + date DATETIME NOT NULL, + reviewId INTEGER NOT NULL, + id INTEGER PRIMARY KEY, + FOREIGN KEY(reviewId) REFERENCES reviews(id))`) + + if err != nil { + return err + } + + _, err = db.Exec(` + DROP TABLE IF EXISTS historyGroups; + CREATE TABLE 
historyGroups( + categoryId INTEGER NOT NULL, + categoryValue FLOAT NOT NULL, + historyId INTEGER NOT NULL, + FOREIGN KEY(historyId) REFERENCES history(id), + FOREIGN KEY(categoryId) REFERENCES categories(id))`) + + if err != nil { + return err + } + + return nil +} diff --git a/build/scrape.go b/build/scrape.go index cd54652..0137d44 100644 --- a/build/scrape.go +++ b/build/scrape.go @@ -36,7 +36,7 @@ type review struct { address string url string features map[string]float64 - weight float64 + count int64 latitude float64 longitude float64 @@ -46,10 +46,8 @@ type review struct { } type scraper interface { - index(doc *goquery.Document) (string, []string) - review(doc *goquery.Document) (string, string, map[string]float64, float64, error) - decode(address string) (float64, float64, error) - load(url string) (*goquery.Document, error) + index(doc *goquery.Document) (string, []string, error) + review(doc *goquery.Document) (string, string, map[string]float64, int64, error) } func makeAbsUrl(ref, base string) (string, error) { @@ -66,10 +64,10 @@ func makeAbsUrl(ref, base string) (string, error) { return b.ResolveReference(r).String(), nil } -func decodeReviews(in chan review, out chan review, scr scraper) { +func decodeReviews(in chan review, out chan review, scr scraper, gc *geoCache) { for rev := range in { if rev.err == nil { - rev.latitude, rev.longitude, rev.err = scr.decode(rev.address) + rev.latitude, rev.longitude, rev.err = gc.decode(rev.address) } out <- rev @@ -78,7 +76,7 @@ func decodeReviews(in chan review, out chan review, scr scraper) { close(out) } -func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) { +func scrapeReview(url string, out chan review, scr scraper, wc *webCache, group *sync.WaitGroup) { defer group.Done() var ( @@ -86,14 +84,14 @@ func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGrou rev = review{url: url, scr: scr} ) - if doc, rev.err = scr.load(rev.url); rev.err == nil { - rev.name, rev.address, rev.features, rev.weight, rev.err = scr.review(doc) + if doc, rev.err = wc.load(rev.url); rev.err == nil { + rev.name, rev.address, rev.features, rev.count, rev.err = scr.review(doc) } out <- rev } -func scrapeIndex(indexUrl string, out chan review, scr scraper) error { +func scrapeIndex(indexUrl string, out chan review, scr scraper, wc *webCache) error { var group sync.WaitGroup defer func() { @@ -102,12 +100,12 @@ func scrapeIndex(indexUrl string, out chan review, scr scraper) error { }() for { - doc, err := scr.load(indexUrl) + doc, err := wc.load(indexUrl) if err != nil { return err } - nextIndexUrl, reviewUrls := scr.index(doc) + nextIndexUrl, reviewUrls, err := scr.index(doc) if err != nil { return err } @@ -119,7 +117,7 @@ func scrapeIndex(indexUrl string, out chan review, scr scraper) error { } group.Add(1) - go scrapeReview(absUrl, out, scr, &group) + go scrapeReview(absUrl, out, scr, wc, &group) } if err != nil { @@ -139,7 +137,7 @@ func scrapeIndex(indexUrl string, out chan review, scr scraper) error { return nil } -func scrape(url string, scr scraper) ([]review, error) { +func scrape(url string, scr scraper, gc *geoCache, wc *webCache) ([]review, error) { out := make(chan review, 128) in := make(chan review, 128) @@ -163,8 +161,8 @@ func scrape(url string, scr scraper) ([]review, error) { } }() - go decodeReviews(in, out, scr) - err := scrapeIndex(url, in, scr) + go decodeReviews(in, out, scr, gc) + err := scrapeIndex(url, in, scr, wc) return reviews, err } diff --git a/build/tabelog.go 
b/build/tabelog.go index 988967f..5ee6455 100644 --- a/build/tabelog.go +++ b/build/tabelog.go @@ -32,7 +32,6 @@ import ( ) type tabelog struct { - scrapeCtx } func (tabelog) define(keyword string) semantics { diff --git a/build/tripadvisor.go b/build/tripadvisor.go index 69f3e36..3e821ad 100644 --- a/build/tripadvisor.go +++ b/build/tripadvisor.go @@ -32,7 +32,6 @@ import ( ) type tripadvisor struct { - scrapeCtx } func (tripadvisor) define(keyword string) semantics {
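
The data/converters directory introduced by the new -converters flag, and the newConverter function that loadConverters calls, are not part of this diff. Below is a minimal sketch of how newConverter could populate the exported fields of the converter struct, assuming the github.com/BurntSushi/toml package is used for decoding; the file name and selector paths shown in the comments are placeholders, not values taken from the original code.

    // newConverter is not shown in this patch. This sketch assumes each file in
    // data/converters is a TOML document decoded straight into the exported
    // fields of converter (Domains, Index, Item, and the selector fields).
    // Requires: import "github.com/BurntSushi/toml"
    func newConverter(filename string) (*converter, error) {
            conv := new(converter)
            if _, err := toml.DecodeFile(filename, conv); err != nil {
                    return nil, err
            }

            return conv, nil
    }

    // A hypothetical data/converters/tabelog.toml matching that layout
    // (the selector paths and regex are placeholders):
    //
    //  domains = ["tabelog.com"]
    //
    //  [index.items]
    //  path = "a.list-item"
    //  attr = "href"
    //
    //  [index.next]
    //  path = "a.next-page"
    //  attr = "href"
    //
    //  [item.name]
    //  path = ".restaurant-name"
    //
    //  [item.address]
    //  path = ".restaurant-address"
    //
    //  [item.count]
    //  path = ".review-count"
    //  regex = "([0-9]+)"

Since converter.compatible matches parsed.Host against Domains, adding support for a new review site becomes a matter of dropping another *.toml file into data/converters instead of writing a site-specific scraper like the old tabelog and tripadvisor types.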