From 37161c7206820acc9a6b47777eaf9812d37de6e5 Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Sun, 23 Aug 2015 17:52:21 +0900 Subject: [PATCH] Updates --- build/.gitignore | 2 +- build/build.go | 226 ++++++++++++++++++++++++++++++++++ build/scrape.go | 311 ++++++++++++++++++----------------------------- build/scraper.go | 157 ------------------------ server.go | 2 +- 5 files changed, 349 insertions(+), 349 deletions(-) create mode 100644 build/build.go delete mode 100644 build/scraper.go diff --git a/build/.gitignore b/build/.gitignore index e171a20..378eac2 100644 --- a/build/.gitignore +++ b/build/.gitignore @@ -1 +1 @@ -scrape +build diff --git a/build/build.go b/build/build.go new file mode 100644 index 0000000..afacf58 --- /dev/null +++ b/build/build.go @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2015 Alex Yatskov + * Author: Alex Yatskov + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package main + +import ( + "bufio" + "database/sql" + "errors" + "net/url" + "os" + + _ "github.com/mattn/go-sqlite3" +) + +func scrapeDataUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + var results []restaurant + var scanner = bufio.NewScanner(file) + + for scanner.Scan() { + if line := scanner.Text(); len(line) > 0 { + parsed, err := url.Parse(line) + if err != nil { + return nil, err + } + + var items []restaurant + switch parsed.Host { + case "tabelog.com": + items = scrape(line, wc, gc, tabelog{}) + case "www.tripadvisor.com": + items = scrape(line, wc, gc, tripadvisor{}) + default: + return nil, errors.New("unsupported review site") + } + + results = append(results, items...) + } + } + + return results, nil +} + +func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) { + gc, err := newGeoCache(geocachePath) + if err != nil { + return nil, err + } + defer gc.save() + + wc, err := newWebCache(webcachePath) + if err != nil { + return nil, err + } + + restaurants, err := scrapeDataUrls(urlsPath, wc, gc) + if err != nil { + return nil, err + } + + return restaurants, nil +} + +func computeStnData(restaurants []restaurant, stationsPath string) error { + sq, err := newStationQuery(stationsPath) + if err != nil { + return err + } + + for i, _ := range restaurants { + r := &restaurants[i] + r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude) + } + + return nil +} + +func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) { + return r.features["food"], r.features["service"], r.features["value"], r.features["atmosphere"] +} + +func dumpData(dbPath string, restaraunts []restaurant) error { + db, err := sql.Open("sqlite3", dbPath) + if err != nil { + return err + } + defer db.Close() + + _, err = db.Exec(` + DROP TABLE IF EXISTS reviews; + CREATE TABLE reviews( + name VARCHAR(100) NOT NULL, + url VARCHAR(200) NOT NULL, + delicious FLOAT NOT NULL, + accommodating FLOAT NOT NULL, + affordable FLOAT NOT NULL, + atmospheric FLOAT NOT NULL, + latitude FLOAT NOT NULL, + longitude FLOAT NOT NULL, + closestStnDist FLOAT NOT NULL, + closestStnName VARCHAR(100) NOT NULL, + accessCount INTEGER NOT NULL, + id INTEGER PRIMARY KEY + )`) + + if err != nil { + return err + } + + for _, r := range restaraunts { + delicious, accommodating, affordable, atmospheric := buildFeatures(r) + + _, err = db.Exec(` + INSERT INTO reviews( + name, + url, + delicious, + accommodating, + affordable, + atmospheric, + latitude, + longitude, + closestStnDist, + closestStnName, + accessCount + ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + r.name, + r.url, + delicious, + accommodating, + affordable, + atmospheric, + r.longitude, + r.latitude, + r.closestStnDist, + r.closestStnName, + 0) + + if err != nil { + return err + } + } + + _, err = db.Exec(` + DROP TABLE IF EXISTS categories; + CREATE TABLE categories( + description VARCHAR(200) NOT NULL, + id INTEGER PRIMARY KEY)`) + + if err != nil { + return err + } + + for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} { + if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil { + return err + } + } + + _, err = db.Exec(` + DROP TABLE IF EXISTS history; + CREATE TABLE history( + date DATETIME NOT NULL, + reviewId INTEGER NOT NULL, + id INTEGER PRIMARY KEY, + FOREIGN KEY(reviewId) REFERENCES reviews(id))`) + + if err != nil { + return err + } + + _, err = db.Exec(` + DROP TABLE IF EXISTS historyGroups; + CREATE TABLE historyGroups( + categoryId INTEGER NOT NULL, + categoryValue FLOAT NOT NULL, + historyId INTEGER NOT NULL, + FOREIGN KEY(historyId) REFERENCES history(id), + FOREIGN KEY(categoryId) REFERENCES categories(id))`) + + if err != nil { + return err + } + + return nil +} + +func main() { + restaurants, err := scrapeData("data/urls.txt", "cache/geocache.json", "cache/webcache") + if err != nil { + panic(err) + } + + if err := computeStnData(restaurants, "data/stations.json"); err != nil { + panic(err) + } + + if err := dumpData("data/db.sqlite3", restaurants); err != nil { + panic(err) + } +} diff --git a/build/scrape.go b/build/scrape.go index afacf58..23a9c80 100644 --- a/build/scrape.go +++ b/build/scrape.go @@ -23,204 +23,135 @@ package main import ( - "bufio" - "database/sql" - "errors" + "log" "net/url" - "os" + "sync" - _ "github.com/mattn/go-sqlite3" + "github.com/PuerkitoBio/goquery" ) -func scrapeDataUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) { - file, err := os.Open(filename) +type restaurant struct { + name string + address string + url string + + features map[string]float64 + + latitude float64 + longitude float64 + + closestStnName string + closestStnDist float64 +} + +type scraper interface { + index(doc *goquery.Document) (string, []string) + review(doc *goquery.Document) (string, string, map[string]float64, error) +} + +func makeAbsUrl(ref, base string) (string, error) { + b, err := url.Parse(base) if err != nil { - return nil, err + return "", err } - defer file.Close() + + r, err := url.Parse(ref) + if err != nil { + return "", err + } + + return b.ResolveReference(r).String(), nil +} + +func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) { + for { + if res, ok := <-in; ok { + pos, err := gc.decode(res.address) + if err == nil { + res.latitude = pos.Latitude + res.longitude = pos.Longitude + out <- res + } else { + log.Printf("failed to decode address for %s (%v)", res.url, err) + } + } else { + close(out) + return + } + } +} + +func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) { + defer group.Done() + + doc, err := wc.load(url) + if err != nil { + log.Printf("failed to load review at %s (%v)", url, err) + return + } + + name, address, features, err := scr.review(doc) + if err != nil { + log.Printf("failed to scrape review at %s (%v)", url, err) + return + } + + out <- restaurant{ + name: name, + address: address, + features: features, + url: url} +} + +func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) { + doc, err := wc.load(indexUrl) + if err != nil { + log.Printf("failed to load index at %s (%v)", indexUrl, err) + return + } + + nextIndexUrl, reviewUrls := scr.index(doc) + if err != nil { + log.Fatal(err) + } + + var group sync.WaitGroup + for _, reviewUrl := range reviewUrls { + absUrl, err := makeAbsUrl(reviewUrl, indexUrl) + if err != nil { + log.Fatal(err) + } + + group.Add(1) + go scrapeReview(absUrl, out, wc, &group, scr) + } + group.Wait() + + if nextIndexUrl == "" { + close(out) + } else { + absUrl, err := makeAbsUrl(nextIndexUrl, indexUrl) + if err != nil { + log.Fatal(err) + } + + scrapeIndex(absUrl, out, wc, scr) + } +} + +func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant { + out := make(chan restaurant, 128) + in := make(chan restaurant, 128) + + go scrapeIndex(url, in, wc, scr) + go decodeReviews(in, out, gc) var results []restaurant - var scanner = bufio.NewScanner(file) - - for scanner.Scan() { - if line := scanner.Text(); len(line) > 0 { - parsed, err := url.Parse(line) - if err != nil { - return nil, err - } - - var items []restaurant - switch parsed.Host { - case "tabelog.com": - items = scrape(line, wc, gc, tabelog{}) - case "www.tripadvisor.com": - items = scrape(line, wc, gc, tripadvisor{}) - default: - return nil, errors.New("unsupported review site") - } - - results = append(results, items...) + for { + if res, ok := <-out; ok { + results = append(results, res) + } else { + return results } } - - return results, nil -} - -func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) { - gc, err := newGeoCache(geocachePath) - if err != nil { - return nil, err - } - defer gc.save() - - wc, err := newWebCache(webcachePath) - if err != nil { - return nil, err - } - - restaurants, err := scrapeDataUrls(urlsPath, wc, gc) - if err != nil { - return nil, err - } - - return restaurants, nil -} - -func computeStnData(restaurants []restaurant, stationsPath string) error { - sq, err := newStationQuery(stationsPath) - if err != nil { - return err - } - - for i, _ := range restaurants { - r := &restaurants[i] - r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude) - } - - return nil -} - -func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) { - return r.features["food"], r.features["service"], r.features["value"], r.features["atmosphere"] -} - -func dumpData(dbPath string, restaraunts []restaurant) error { - db, err := sql.Open("sqlite3", dbPath) - if err != nil { - return err - } - defer db.Close() - - _, err = db.Exec(` - DROP TABLE IF EXISTS reviews; - CREATE TABLE reviews( - name VARCHAR(100) NOT NULL, - url VARCHAR(200) NOT NULL, - delicious FLOAT NOT NULL, - accommodating FLOAT NOT NULL, - affordable FLOAT NOT NULL, - atmospheric FLOAT NOT NULL, - latitude FLOAT NOT NULL, - longitude FLOAT NOT NULL, - closestStnDist FLOAT NOT NULL, - closestStnName VARCHAR(100) NOT NULL, - accessCount INTEGER NOT NULL, - id INTEGER PRIMARY KEY - )`) - - if err != nil { - return err - } - - for _, r := range restaraunts { - delicious, accommodating, affordable, atmospheric := buildFeatures(r) - - _, err = db.Exec(` - INSERT INTO reviews( - name, - url, - delicious, - accommodating, - affordable, - atmospheric, - latitude, - longitude, - closestStnDist, - closestStnName, - accessCount - ) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - r.name, - r.url, - delicious, - accommodating, - affordable, - atmospheric, - r.longitude, - r.latitude, - r.closestStnDist, - r.closestStnName, - 0) - - if err != nil { - return err - } - } - - _, err = db.Exec(` - DROP TABLE IF EXISTS categories; - CREATE TABLE categories( - description VARCHAR(200) NOT NULL, - id INTEGER PRIMARY KEY)`) - - if err != nil { - return err - } - - for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} { - if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil { - return err - } - } - - _, err = db.Exec(` - DROP TABLE IF EXISTS history; - CREATE TABLE history( - date DATETIME NOT NULL, - reviewId INTEGER NOT NULL, - id INTEGER PRIMARY KEY, - FOREIGN KEY(reviewId) REFERENCES reviews(id))`) - - if err != nil { - return err - } - - _, err = db.Exec(` - DROP TABLE IF EXISTS historyGroups; - CREATE TABLE historyGroups( - categoryId INTEGER NOT NULL, - categoryValue FLOAT NOT NULL, - historyId INTEGER NOT NULL, - FOREIGN KEY(historyId) REFERENCES history(id), - FOREIGN KEY(categoryId) REFERENCES categories(id))`) - - if err != nil { - return err - } - - return nil -} - -func main() { - restaurants, err := scrapeData("data/urls.txt", "cache/geocache.json", "cache/webcache") - if err != nil { - panic(err) - } - - if err := computeStnData(restaurants, "data/stations.json"); err != nil { - panic(err) - } - - if err := dumpData("data/db.sqlite3", restaurants); err != nil { - panic(err) - } } diff --git a/build/scraper.go b/build/scraper.go deleted file mode 100644 index 23a9c80..0000000 --- a/build/scraper.go +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2015 Alex Yatskov - * Author: Alex Yatskov - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of - * this software and associated documentation files (the "Software"), to deal in - * the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of - * the Software, and to permit persons to whom the Software is furnished to do so, - * subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package main - -import ( - "log" - "net/url" - "sync" - - "github.com/PuerkitoBio/goquery" -) - -type restaurant struct { - name string - address string - url string - - features map[string]float64 - - latitude float64 - longitude float64 - - closestStnName string - closestStnDist float64 -} - -type scraper interface { - index(doc *goquery.Document) (string, []string) - review(doc *goquery.Document) (string, string, map[string]float64, error) -} - -func makeAbsUrl(ref, base string) (string, error) { - b, err := url.Parse(base) - if err != nil { - return "", err - } - - r, err := url.Parse(ref) - if err != nil { - return "", err - } - - return b.ResolveReference(r).String(), nil -} - -func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) { - for { - if res, ok := <-in; ok { - pos, err := gc.decode(res.address) - if err == nil { - res.latitude = pos.Latitude - res.longitude = pos.Longitude - out <- res - } else { - log.Printf("failed to decode address for %s (%v)", res.url, err) - } - } else { - close(out) - return - } - } -} - -func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) { - defer group.Done() - - doc, err := wc.load(url) - if err != nil { - log.Printf("failed to load review at %s (%v)", url, err) - return - } - - name, address, features, err := scr.review(doc) - if err != nil { - log.Printf("failed to scrape review at %s (%v)", url, err) - return - } - - out <- restaurant{ - name: name, - address: address, - features: features, - url: url} -} - -func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) { - doc, err := wc.load(indexUrl) - if err != nil { - log.Printf("failed to load index at %s (%v)", indexUrl, err) - return - } - - nextIndexUrl, reviewUrls := scr.index(doc) - if err != nil { - log.Fatal(err) - } - - var group sync.WaitGroup - for _, reviewUrl := range reviewUrls { - absUrl, err := makeAbsUrl(reviewUrl, indexUrl) - if err != nil { - log.Fatal(err) - } - - group.Add(1) - go scrapeReview(absUrl, out, wc, &group, scr) - } - group.Wait() - - if nextIndexUrl == "" { - close(out) - } else { - absUrl, err := makeAbsUrl(nextIndexUrl, indexUrl) - if err != nil { - log.Fatal(err) - } - - scrapeIndex(absUrl, out, wc, scr) - } -} - -func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant { - out := make(chan restaurant, 128) - in := make(chan restaurant, 128) - - go scrapeIndex(url, in, wc, scr) - go decodeReviews(in, out, gc) - - var results []restaurant - for { - if res, ok := <-out; ok { - results = append(results, res) - } else { - return results - } - } -} diff --git a/server.go b/server.go index a9a4ecf..3f4fb19 100644 --- a/server.go +++ b/server.go @@ -312,7 +312,7 @@ func main() { staticDir := flag.String("static", "static", "path to static files") portNum := flag.Int("port", 8080, "port to serve content on") - dataSrc := flag.String("data", "db.sqlite3", "data source for database") + dataSrc := flag.String("data", "build/data/db.sqlite3", "data source for database") profile := flag.String("profile", "", "write cpu profile to file") flag.Parse()