1
This commit is contained in:
Alex Yatskov 2015-08-23 17:52:21 +09:00
parent c4771359bd
commit 37161c7206
5 changed files with 349 additions and 349 deletions

2
build/.gitignore vendored
View File

@ -1 +1 @@
scrape
build

226
build/build.go Normal file
View File

@ -0,0 +1,226 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"bufio"
"database/sql"
"errors"
"net/url"
"os"
_ "github.com/mattn/go-sqlite3"
)
func scrapeDataUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) {
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer file.Close()
var results []restaurant
var scanner = bufio.NewScanner(file)
for scanner.Scan() {
if line := scanner.Text(); len(line) > 0 {
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
var items []restaurant
switch parsed.Host {
case "tabelog.com":
items = scrape(line, wc, gc, tabelog{})
case "www.tripadvisor.com":
items = scrape(line, wc, gc, tripadvisor{})
default:
return nil, errors.New("unsupported review site")
}
results = append(results, items...)
}
}
return results, nil
}
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
gc, err := newGeoCache(geocachePath)
if err != nil {
return nil, err
}
defer gc.save()
wc, err := newWebCache(webcachePath)
if err != nil {
return nil, err
}
restaurants, err := scrapeDataUrls(urlsPath, wc, gc)
if err != nil {
return nil, err
}
return restaurants, nil
}
func computeStnData(restaurants []restaurant, stationsPath string) error {
sq, err := newStationQuery(stationsPath)
if err != nil {
return err
}
for i, _ := range restaurants {
r := &restaurants[i]
r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
}
return nil
}
func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) {
return r.features["food"], r.features["service"], r.features["value"], r.features["atmosphere"]
}
func dumpData(dbPath string, restaraunts []restaurant) error {
db, err := sql.Open("sqlite3", dbPath)
if err != nil {
return err
}
defer db.Close()
_, err = db.Exec(`
DROP TABLE IF EXISTS reviews;
CREATE TABLE reviews(
name VARCHAR(100) NOT NULL,
url VARCHAR(200) NOT NULL,
delicious FLOAT NOT NULL,
accommodating FLOAT NOT NULL,
affordable FLOAT NOT NULL,
atmospheric FLOAT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL,
closestStnDist FLOAT NOT NULL,
closestStnName VARCHAR(100) NOT NULL,
accessCount INTEGER NOT NULL,
id INTEGER PRIMARY KEY
)`)
if err != nil {
return err
}
for _, r := range restaraunts {
delicious, accommodating, affordable, atmospheric := buildFeatures(r)
_, err = db.Exec(`
INSERT INTO reviews(
name,
url,
delicious,
accommodating,
affordable,
atmospheric,
latitude,
longitude,
closestStnDist,
closestStnName,
accessCount
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
r.name,
r.url,
delicious,
accommodating,
affordable,
atmospheric,
r.longitude,
r.latitude,
r.closestStnDist,
r.closestStnName,
0)
if err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS categories;
CREATE TABLE categories(
description VARCHAR(200) NOT NULL,
id INTEGER PRIMARY KEY)`)
if err != nil {
return err
}
for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS history;
CREATE TABLE history(
date DATETIME NOT NULL,
reviewId INTEGER NOT NULL,
id INTEGER PRIMARY KEY,
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
if err != nil {
return err
}
_, err = db.Exec(`
DROP TABLE IF EXISTS historyGroups;
CREATE TABLE historyGroups(
categoryId INTEGER NOT NULL,
categoryValue FLOAT NOT NULL,
historyId INTEGER NOT NULL,
FOREIGN KEY(historyId) REFERENCES history(id),
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
if err != nil {
return err
}
return nil
}
func main() {
restaurants, err := scrapeData("data/urls.txt", "cache/geocache.json", "cache/webcache")
if err != nil {
panic(err)
}
if err := computeStnData(restaurants, "data/stations.json"); err != nil {
panic(err)
}
if err := dumpData("data/db.sqlite3", restaurants); err != nil {
panic(err)
}
}

View File

@ -23,204 +23,135 @@
package main
import (
"bufio"
"database/sql"
"errors"
"log"
"net/url"
"os"
"sync"
_ "github.com/mattn/go-sqlite3"
"github.com/PuerkitoBio/goquery"
)
func scrapeDataUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) {
file, err := os.Open(filename)
type restaurant struct {
name string
address string
url string
features map[string]float64
latitude float64
longitude float64
closestStnName string
closestStnDist float64
}
type scraper interface {
index(doc *goquery.Document) (string, []string)
review(doc *goquery.Document) (string, string, map[string]float64, error)
}
func makeAbsUrl(ref, base string) (string, error) {
b, err := url.Parse(base)
if err != nil {
return nil, err
return "", err
}
defer file.Close()
r, err := url.Parse(ref)
if err != nil {
return "", err
}
return b.ResolveReference(r).String(), nil
}
func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
for {
if res, ok := <-in; ok {
pos, err := gc.decode(res.address)
if err == nil {
res.latitude = pos.Latitude
res.longitude = pos.Longitude
out <- res
} else {
log.Printf("failed to decode address for %s (%v)", res.url, err)
}
} else {
close(out)
return
}
}
}
func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
defer group.Done()
doc, err := wc.load(url)
if err != nil {
log.Printf("failed to load review at %s (%v)", url, err)
return
}
name, address, features, err := scr.review(doc)
if err != nil {
log.Printf("failed to scrape review at %s (%v)", url, err)
return
}
out <- restaurant{
name: name,
address: address,
features: features,
url: url}
}
func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
doc, err := wc.load(indexUrl)
if err != nil {
log.Printf("failed to load index at %s (%v)", indexUrl, err)
return
}
nextIndexUrl, reviewUrls := scr.index(doc)
if err != nil {
log.Fatal(err)
}
var group sync.WaitGroup
for _, reviewUrl := range reviewUrls {
absUrl, err := makeAbsUrl(reviewUrl, indexUrl)
if err != nil {
log.Fatal(err)
}
group.Add(1)
go scrapeReview(absUrl, out, wc, &group, scr)
}
group.Wait()
if nextIndexUrl == "" {
close(out)
} else {
absUrl, err := makeAbsUrl(nextIndexUrl, indexUrl)
if err != nil {
log.Fatal(err)
}
scrapeIndex(absUrl, out, wc, scr)
}
}
func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant {
out := make(chan restaurant, 128)
in := make(chan restaurant, 128)
go scrapeIndex(url, in, wc, scr)
go decodeReviews(in, out, gc)
var results []restaurant
var scanner = bufio.NewScanner(file)
for scanner.Scan() {
if line := scanner.Text(); len(line) > 0 {
parsed, err := url.Parse(line)
if err != nil {
return nil, err
}
var items []restaurant
switch parsed.Host {
case "tabelog.com":
items = scrape(line, wc, gc, tabelog{})
case "www.tripadvisor.com":
items = scrape(line, wc, gc, tripadvisor{})
default:
return nil, errors.New("unsupported review site")
}
results = append(results, items...)
for {
if res, ok := <-out; ok {
results = append(results, res)
} else {
return results
}
}
return results, nil
}
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
gc, err := newGeoCache(geocachePath)
if err != nil {
return nil, err
}
defer gc.save()
wc, err := newWebCache(webcachePath)
if err != nil {
return nil, err
}
restaurants, err := scrapeDataUrls(urlsPath, wc, gc)
if err != nil {
return nil, err
}
return restaurants, nil
}
func computeStnData(restaurants []restaurant, stationsPath string) error {
sq, err := newStationQuery(stationsPath)
if err != nil {
return err
}
for i, _ := range restaurants {
r := &restaurants[i]
r.closestStnName, r.closestStnDist = sq.closestStation(r.latitude, r.longitude)
}
return nil
}
func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) {
return r.features["food"], r.features["service"], r.features["value"], r.features["atmosphere"]
}
func dumpData(dbPath string, restaraunts []restaurant) error {
db, err := sql.Open("sqlite3", dbPath)
if err != nil {
return err
}
defer db.Close()
_, err = db.Exec(`
DROP TABLE IF EXISTS reviews;
CREATE TABLE reviews(
name VARCHAR(100) NOT NULL,
url VARCHAR(200) NOT NULL,
delicious FLOAT NOT NULL,
accommodating FLOAT NOT NULL,
affordable FLOAT NOT NULL,
atmospheric FLOAT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL,
closestStnDist FLOAT NOT NULL,
closestStnName VARCHAR(100) NOT NULL,
accessCount INTEGER NOT NULL,
id INTEGER PRIMARY KEY
)`)
if err != nil {
return err
}
for _, r := range restaraunts {
delicious, accommodating, affordable, atmospheric := buildFeatures(r)
_, err = db.Exec(`
INSERT INTO reviews(
name,
url,
delicious,
accommodating,
affordable,
atmospheric,
latitude,
longitude,
closestStnDist,
closestStnName,
accessCount
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
r.name,
r.url,
delicious,
accommodating,
affordable,
atmospheric,
r.longitude,
r.latitude,
r.closestStnDist,
r.closestStnName,
0)
if err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS categories;
CREATE TABLE categories(
description VARCHAR(200) NOT NULL,
id INTEGER PRIMARY KEY)`)
if err != nil {
return err
}
for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS history;
CREATE TABLE history(
date DATETIME NOT NULL,
reviewId INTEGER NOT NULL,
id INTEGER PRIMARY KEY,
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
if err != nil {
return err
}
_, err = db.Exec(`
DROP TABLE IF EXISTS historyGroups;
CREATE TABLE historyGroups(
categoryId INTEGER NOT NULL,
categoryValue FLOAT NOT NULL,
historyId INTEGER NOT NULL,
FOREIGN KEY(historyId) REFERENCES history(id),
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
if err != nil {
return err
}
return nil
}
func main() {
restaurants, err := scrapeData("data/urls.txt", "cache/geocache.json", "cache/webcache")
if err != nil {
panic(err)
}
if err := computeStnData(restaurants, "data/stations.json"); err != nil {
panic(err)
}
if err := dumpData("data/db.sqlite3", restaurants); err != nil {
panic(err)
}
}

View File

@ -1,157 +0,0 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"log"
"net/url"
"sync"
"github.com/PuerkitoBio/goquery"
)
type restaurant struct {
name string
address string
url string
features map[string]float64
latitude float64
longitude float64
closestStnName string
closestStnDist float64
}
type scraper interface {
index(doc *goquery.Document) (string, []string)
review(doc *goquery.Document) (string, string, map[string]float64, error)
}
func makeAbsUrl(ref, base string) (string, error) {
b, err := url.Parse(base)
if err != nil {
return "", err
}
r, err := url.Parse(ref)
if err != nil {
return "", err
}
return b.ResolveReference(r).String(), nil
}
func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
for {
if res, ok := <-in; ok {
pos, err := gc.decode(res.address)
if err == nil {
res.latitude = pos.Latitude
res.longitude = pos.Longitude
out <- res
} else {
log.Printf("failed to decode address for %s (%v)", res.url, err)
}
} else {
close(out)
return
}
}
}
func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
defer group.Done()
doc, err := wc.load(url)
if err != nil {
log.Printf("failed to load review at %s (%v)", url, err)
return
}
name, address, features, err := scr.review(doc)
if err != nil {
log.Printf("failed to scrape review at %s (%v)", url, err)
return
}
out <- restaurant{
name: name,
address: address,
features: features,
url: url}
}
func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
doc, err := wc.load(indexUrl)
if err != nil {
log.Printf("failed to load index at %s (%v)", indexUrl, err)
return
}
nextIndexUrl, reviewUrls := scr.index(doc)
if err != nil {
log.Fatal(err)
}
var group sync.WaitGroup
for _, reviewUrl := range reviewUrls {
absUrl, err := makeAbsUrl(reviewUrl, indexUrl)
if err != nil {
log.Fatal(err)
}
group.Add(1)
go scrapeReview(absUrl, out, wc, &group, scr)
}
group.Wait()
if nextIndexUrl == "" {
close(out)
} else {
absUrl, err := makeAbsUrl(nextIndexUrl, indexUrl)
if err != nil {
log.Fatal(err)
}
scrapeIndex(absUrl, out, wc, scr)
}
}
func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant {
out := make(chan restaurant, 128)
in := make(chan restaurant, 128)
go scrapeIndex(url, in, wc, scr)
go decodeReviews(in, out, gc)
var results []restaurant
for {
if res, ok := <-out; ok {
results = append(results, res)
} else {
return results
}
}
}

View File

@ -312,7 +312,7 @@ func main() {
staticDir := flag.String("static", "static", "path to static files")
portNum := flag.Int("port", 8080, "port to serve content on")
dataSrc := flag.String("data", "db.sqlite3", "data source for database")
dataSrc := flag.String("data", "build/data/db.sqlite3", "data source for database")
profile := flag.String("profile", "", "write cpu profile to file")
flag.Parse()