1
This commit is contained in:
Alex Yatskov 2015-09-22 11:06:20 +09:00
parent 5a4fe4e8ea
commit 4cce523103
6 changed files with 322 additions and 262 deletions

View File

@ -24,34 +24,14 @@ package main
import ( import (
"bufio" "bufio"
"bytes"
"database/sql"
"encoding/binary"
"errors"
"flag" "flag"
"hash/fnv"
"log" "log"
"net/url"
"os" "os"
"path/filepath"
"github.com/PuerkitoBio/goquery"
"github.com/fatih/color"
_ "github.com/mattn/go-sqlite3" _ "github.com/mattn/go-sqlite3"
) )
type scrapeCtx struct {
gc *geoCache
wc *webCache
}
func (s scrapeCtx) decode(address string) (float64, float64, error) {
return s.gc.decode(address)
}
func (s scrapeCtx) load(url string) (*goquery.Document, error) {
return s.wc.load(url)
}
type restaurant struct { type restaurant struct {
name string name string
address string address string
@ -65,254 +45,119 @@ type restaurant struct {
closestStnDist float64 closestStnDist float64
} }
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]review, error) { func loadConverters(directory string) ([]*converter, error) {
gc, err := newGeoCache(geocachePath) matches, err := filepath.Glob(filepath.Join(directory, "*.toml"))
if err != nil {
return nil, err
}
defer gc.save()
wc, err := newWebCache(webcachePath)
if err != nil { if err != nil {
return nil, err return nil, err
} }
file, err := os.Open(urlsPath) var convs []*converter
for _, match := range matches {
conv, err := newConverter(match)
if err != nil {
return nil, err
}
convs = append(convs, conv)
}
return convs, nil
}
func loadUrls(filename string) ([]string, error) {
file, err := os.Open(filename)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer file.Close() defer file.Close()
ctx := scrapeCtx{gc, wc} var urls []string
tlog := tabelog{scrapeCtx: ctx}
tadv := tripadvisor{scrapeCtx: ctx}
var reviews []review
for scanner := bufio.NewScanner(file); scanner.Scan(); { for scanner := bufio.NewScanner(file); scanner.Scan(); {
if line := scanner.Text(); len(line) > 0 { if url := scanner.Text(); len(url) > 0 {
parsed, err := url.Parse(line) urls = append(urls, url)
if err != nil { }
return nil, err
} }
var revs []review return urls, nil
switch parsed.Host { }
case "tabelog.com":
revs, err = scrape(line, tlog) func scrapeReviews(urls []string, converters []*converter, gc *geoCache, wc *webCache) ([]review, error) {
case "www.tripadvisor.com": var reviews []review
revs, err = scrape(line, tadv)
default: for _, u := range urls {
err = errors.New("unsupported review site") for _, c := range converters {
if !c.compatible(u) {
continue
} }
revs, err := scrape(u, c, gc, wc)
if err != nil { if err != nil {
return nil, err return nil, err
} }
reviews = append(reviews, revs...) reviews = append(reviews, revs...)
break
} }
} }
return reviews, nil return reviews, nil
} }
func collateData(reviews []review) map[uint64]*restaurant { func main() {
restaurants := make(map[uint64]*restaurant)
for _, rev := range reviews {
var buff bytes.Buffer
binary.Write(&buff, binary.LittleEndian, rev.latitude)
binary.Write(&buff, binary.LittleEndian, rev.longitude)
binary.Write(&buff, binary.LittleEndian, rev.name)
hash := fnv.New64()
hash.Write(buff.Bytes())
var rest *restaurant
if rest, _ = restaurants[hash.Sum64()]; rest == nil {
rest = &restaurant{name: rev.name, address: rev.address, latitude: rev.latitude, longitude: rev.longitude}
restaurants[hash.Sum64()] = rest
}
rest.reviews = append(rest.reviews, rev)
}
return restaurants
}
func computeStations(restaurants map[uint64]*restaurant, stationsPath string) error {
sq, err := newStationQuery(stationsPath)
if err != nil {
return err
}
for _, rest := range restaurants {
rest.closestStnName, rest.closestStnDist = sq.closestStation(rest.latitude, rest.longitude)
}
return nil
}
func computeSemantics(restaraunts map[uint64]*restaurant) {
type definer interface {
define(keyword string) semantics
}
for _, rest := range restaraunts {
var ( var (
sem semantics dbPath = flag.String("db", "data/db.sqlite3", "database output path")
weight float64 urlsPath = flag.String("urls", "data/urls.txt", "index URLs to scrape")
convertersPath = flag.String("converters", "data/converters", "directory for converters")
stationsPath = flag.String("stations", "data/stations.json", "station geolocation data")
geocachePath = flag.String("geocache", "cache/geocache.json", "geolocation data cache")
webcachePath = flag.String("webcache", "cache/webcache", "web data cache")
) )
for _, rev := range rest.reviews {
def, ok := rev.scr.(definer)
if !ok {
continue
}
for name, value := range rev.features {
sem = sem.combine(def.define(name), rev.weight*value)
}
weight += rev.weight
}
if weight > 0.0 {
rest.sem = sem.reduce(weight)
}
}
}
func dumpData(dbPath string, restaraunts map[uint64]*restaurant) error {
db, err := sql.Open("sqlite3", dbPath)
if err != nil {
return err
}
defer db.Close()
_, err = db.Exec(`
DROP TABLE IF EXISTS reviews;
CREATE TABLE reviews(
name VARCHAR(100) NOT NULL,
address VARCHAR(400) NOT NULL,
delicious FLOAT NOT NULL,
accommodating FLOAT NOT NULL,
affordable FLOAT NOT NULL,
atmospheric FLOAT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL,
closestStnDist FLOAT NOT NULL,
closestStnName VARCHAR(100) NOT NULL,
accessCount INTEGER NOT NULL,
id INTEGER PRIMARY KEY
)`)
if err != nil {
return err
}
for _, rest := range restaraunts {
_, err = db.Exec(`
INSERT INTO reviews(
name,
address,
delicious,
accommodating,
affordable,
atmospheric,
latitude,
longitude,
closestStnDist,
closestStnName,
accessCount
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
rest.name,
rest.address,
rest.sem.Delicious,
rest.sem.Accomodating,
rest.sem.Affordable,
rest.sem.Atmospheric,
rest.latitude,
rest.longitude,
rest.closestStnDist,
rest.closestStnName,
0)
if err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS categories;
CREATE TABLE categories(
description VARCHAR(200) NOT NULL,
id INTEGER PRIMARY KEY)`)
if err != nil {
return err
}
for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
return err
}
}
_, err = db.Exec(`
DROP TABLE IF EXISTS history;
CREATE TABLE history(
date DATETIME NOT NULL,
reviewId INTEGER NOT NULL,
id INTEGER PRIMARY KEY,
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
if err != nil {
return err
}
_, err = db.Exec(`
DROP TABLE IF EXISTS historyGroups;
CREATE TABLE historyGroups(
categoryId INTEGER NOT NULL,
categoryValue FLOAT NOT NULL,
historyId INTEGER NOT NULL,
FOREIGN KEY(historyId) REFERENCES history(id),
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
if err != nil {
return err
}
return nil
}
func main() {
dbPath := flag.String("db", "data/db.sqlite3", "database output path")
urlsPath := flag.String("urls", "data/urls.txt", "index URLs to scrape")
stationsPath := flag.String("stations", "data/stations.json", "station geolocation data")
geocachePath := flag.String("geocache", "cache/geocache.json", "geolocation data cache")
webcachePath := flag.String("webcache", "cache/webcache", "web data cache")
flag.Parse() flag.Parse()
log.Print(color.BlueString("scraping data...")) log.Printf("loading geocache %s...", *geocachePath)
reviews, err := scrapeData(*urlsPath, *geocachePath, *webcachePath) gc, err := newGeoCache(*geocachePath)
if err != nil {
log.Fatal(err)
}
defer gc.save()
log.Printf("loading webcache %s...", *webcachePath)
wc, err := newWebCache(*webcachePath)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
log.Print(color.BlueString("collating data...")) log.Printf("loading urls %s...", *urlsPath)
urls, err := loadUrls(*urlsPath)
if err != nil {
log.Fatal(err)
}
log.Printf("loading converters %s...", *convertersPath)
converters, err := loadConverters(*convertersPath)
if err != nil {
log.Fatal(err)
}
log.Print("scraping reviews...")
reviews, err := scrapeReviews(urls, converters, gc, wc)
if err != nil {
log.Fatal(err)
}
log.Print("collating data...")
restaurants := collateData(reviews) restaurants := collateData(reviews)
log.Print(color.BlueString("computing data semantics..")) log.Print("computing data semantics..")
computeSemantics(restaurants) computeSemantics(restaurants)
log.Print(color.BlueString("computing station data...")) log.Print("computing station data...")
if err := computeStations(restaurants, *stationsPath); err != nil { if err := computeStations(restaurants, *stationsPath); err != nil {
log.Fatal(err) log.Fatal(err)
} }
log.Print(color.BlueString("saving data...")) log.Print("saving data...")
if err := dumpData(*dbPath, restaurants); err != nil { if err := dumpData(*dbPath, restaurants); err != nil {
log.Fatal(err) log.Fatal(err)
} }

View File

@ -26,6 +26,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"net/url"
"regexp" "regexp"
"strconv" "strconv"
"strings" "strings"
@ -63,9 +64,9 @@ func (s semantics) reduce(weight float64) semantics {
} }
// //
// locator // selector
// //
type locator struct { type selector struct {
Path string Path string
Attr string Attr string
RegEx string RegEx string
@ -73,7 +74,7 @@ type locator struct {
regExComp *regexp.Regexp regExComp *regexp.Regexp
} }
func (l *locator) locateStrings(doc *goquery.Document) ([]string, error) { func (l *selector) locateStrings(doc *goquery.Document) ([]string, error) {
var err error var err error
if len(l.RegEx) > 0 && l.regExComp == nil { if len(l.RegEx) > 0 && l.regExComp == nil {
if l.regExComp, err = regexp.Compile(l.RegEx); err != nil { if l.regExComp, err = regexp.Compile(l.RegEx); err != nil {
@ -105,7 +106,7 @@ func (l *locator) locateStrings(doc *goquery.Document) ([]string, error) {
} }
func (l *locator) locateString(doc *goquery.Document) (string, error) { func (l *selector) locateString(doc *goquery.Document) (string, error) {
strs, err := l.locateStrings(doc) strs, err := l.locateStrings(doc)
if err != nil { if err != nil {
return "", err return "", err
@ -114,7 +115,7 @@ func (l *locator) locateString(doc *goquery.Document) (string, error) {
return strings.Join(strs, " "), nil return strings.Join(strs, " "), nil
} }
func (l *locator) locateInt(doc *goquery.Document) (int64, error) { func (l *selector) locateInt(doc *goquery.Document) (int64, error) {
str, err := l.locateString(doc) str, err := l.locateString(doc)
if err != nil { if err != nil {
return 0, err return 0, err
@ -123,7 +124,7 @@ func (l *locator) locateInt(doc *goquery.Document) (int64, error) {
return strconv.ParseInt(str, 10, 8) return strconv.ParseInt(str, 10, 8)
} }
func (l *locator) locateFloat(doc *goquery.Document) (float64, error) { func (l *selector) locateFloat(doc *goquery.Document) (float64, error) {
str, err := l.locateString(doc) str, err := l.locateString(doc)
if err != nil { if err != nil {
return 0.0, err return 0.0, err
@ -136,18 +137,20 @@ func (l *locator) locateFloat(doc *goquery.Document) (float64, error) {
// converter // converter
// //
type converter struct { type converter struct {
Domains []string
Index struct { Index struct {
Items locator Items selector
Next locator Next selector
} }
Item struct { Item struct {
Name locator Name selector
Address locator Address selector
Count locator Count selector
Props map[string]struct { Props map[string]struct {
semantics semantics
locator selector
Scale float64 Scale float64
} }
} }
@ -171,6 +174,21 @@ func (c converter) define(keyword string) semantics {
return c.Item.Props[keyword].semantics return c.Item.Props[keyword].semantics
} }
func (c converter) compatible(address string) bool {
parsed, err := url.Parse(address)
if err != nil {
return false
}
for _, d := range c.Domains {
if d == parsed.Host {
return true
}
}
return false
}
func (c converter) index(doc *goquery.Document) (next string, items []string, err error) { func (c converter) index(doc *goquery.Document) (next string, items []string, err error) {
if items, err = c.Index.Items.locateStrings(doc); err != nil { if items, err = c.Index.Items.locateStrings(doc); err != nil {
return return

201
build/db.go Normal file
View File

@ -0,0 +1,201 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"bytes"
"database/sql"
"encoding/binary"
"hash/fnv"
)
// collateData groups reviews that refer to the same restaurant. Two reviews
// are treated as the same restaurant when their latitude, longitude and name
// produce the same 64-bit FNV hash; that hash is the key of the returned map.
// The restaurant entry takes its name, address and coordinates from the first
// review seen for a key, and every review with that key is appended to it.
func collateData(reviews []review) map[uint64]*restaurant {
	restaurants := make(map[uint64]*restaurant)

	for _, rev := range reviews {
		// binary.Write only accepts fixed-size values, so it is used for the
		// coordinates while the name is appended as raw bytes. Handing the
		// string to binary.Write fails without writing anything, which would
		// silently leave the name out of the key.
		// Writes into a bytes.Buffer do not return errors for these values.
		var buff bytes.Buffer
		binary.Write(&buff, binary.LittleEndian, rev.latitude)
		binary.Write(&buff, binary.LittleEndian, rev.longitude)
		buff.WriteString(rev.name)

		hash := fnv.New64()
		hash.Write(buff.Bytes())
		key := hash.Sum64()

		rest, ok := restaurants[key]
		if !ok {
			rest = &restaurant{
				name:      rev.name,
				address:   rev.address,
				latitude:  rev.latitude,
				longitude: rev.longitude,
			}
			restaurants[key] = rest
		}

		rest.reviews = append(rest.reviews, rev)
	}

	return restaurants
}
// computeSemantics fills in the sem field of every restaurant from its
// reviews. Each feature of a review contributes the semantics its scraper
// defines for that feature name, weighted by the feature value multiplied by
// the review's count. The accumulated result is reduced by the summed count.
// Reviews whose scraper has no define method are skipped, and a restaurant
// whose summed count is not positive keeps its existing sem value.
func computeSemantics(restaraunts map[uint64]*restaurant) {
	type definer interface {
		define(keyword string) semantics
	}

	for _, place := range restaraunts {
		var (
			total semantics
			votes int64
		)

		for _, r := range place.reviews {
			d, ok := r.scr.(definer)
			if !ok {
				continue
			}

			for feature, score := range r.features {
				total = total.combine(d.define(feature), float64(r.count)*score)
			}

			votes += r.count
		}

		if votes <= 0 {
			continue
		}

		place.sem = total.reduce(float64(votes))
	}
}
// computeStations builds a station query from the data at stationsPath and
// records, for every restaurant, the name of and distance to the station the
// query reports as closest to the restaurant's coordinates. It returns the
// error from creating the query, if any.
func computeStations(restaurants map[uint64]*restaurant, stationsPath string) error {
	query, err := newStationQuery(stationsPath)
	if err != nil {
		return err
	}

	for _, place := range restaurants {
		name, dist := query.closestStation(place.latitude, place.longitude)
		place.closestStnName = name
		place.closestStnDist = dist
	}

	return nil
}
// dumpData writes the collated restaurants to the SQLite database at dbPath.
//
// It drops and recreates the reviews, categories, history and historyGroups
// tables, inserts one reviews row per restaurant (accessCount starts at 0) and
// seeds categories with three fixed descriptions.
//
// All statements run inside a single transaction: if any step fails the
// transaction is rolled back and the error is returned, so a failed run does
// not leave a partially rebuilt database behind. The row insert is prepared
// once and reused for every restaurant.
func dumpData(dbPath string, restaraunts map[uint64]*restaurant) error {
	db, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		return err
	}
	defer db.Close()

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// After a successful Commit this returns sql.ErrTxDone, which is
	// deliberately ignored; on every early return it undoes the partial work.
	defer tx.Rollback()

	_, err = tx.Exec(`
		DROP TABLE IF EXISTS reviews;
		CREATE TABLE reviews(
			name VARCHAR(100) NOT NULL,
			address VARCHAR(400) NOT NULL,
			delicious FLOAT NOT NULL,
			accommodating FLOAT NOT NULL,
			affordable FLOAT NOT NULL,
			atmospheric FLOAT NOT NULL,
			latitude FLOAT NOT NULL,
			longitude FLOAT NOT NULL,
			closestStnDist FLOAT NOT NULL,
			closestStnName VARCHAR(100) NOT NULL,
			accessCount INTEGER NOT NULL,
			id INTEGER PRIMARY KEY
		)`)
	if err != nil {
		return err
	}

	insertReview, err := tx.Prepare(`
		INSERT INTO reviews(
			name,
			address,
			delicious,
			accommodating,
			affordable,
			atmospheric,
			latitude,
			longitude,
			closestStnDist,
			closestStnName,
			accessCount
		) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
	if err != nil {
		return err
	}
	defer insertReview.Close()

	for _, rest := range restaraunts {
		_, err = insertReview.Exec(
			rest.name,
			rest.address,
			rest.sem.Delicious,
			rest.sem.Accomodating,
			rest.sem.Affordable,
			rest.sem.Atmospheric,
			rest.latitude,
			rest.longitude,
			rest.closestStnDist,
			rest.closestStnName,
			0)
		if err != nil {
			return err
		}
	}

	_, err = tx.Exec(`
		DROP TABLE IF EXISTS categories;
		CREATE TABLE categories(
			description VARCHAR(200) NOT NULL,
			id INTEGER PRIMARY KEY)`)
	if err != nil {
		return err
	}

	for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
		if _, err := tx.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
			return err
		}
	}

	_, err = tx.Exec(`
		DROP TABLE IF EXISTS history;
		CREATE TABLE history(
			date DATETIME NOT NULL,
			reviewId INTEGER NOT NULL,
			id INTEGER PRIMARY KEY,
			FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
	if err != nil {
		return err
	}

	_, err = tx.Exec(`
		DROP TABLE IF EXISTS historyGroups;
		CREATE TABLE historyGroups(
			categoryId INTEGER NOT NULL,
			categoryValue FLOAT NOT NULL,
			historyId INTEGER NOT NULL,
			FOREIGN KEY(historyId) REFERENCES history(id),
			FOREIGN KEY(categoryId) REFERENCES categories(id))`)
	if err != nil {
		return err
	}

	return tx.Commit()
}

View File

@ -36,7 +36,7 @@ type review struct {
address string address string
url string url string
features map[string]float64 features map[string]float64
weight float64 count int64
latitude float64 latitude float64
longitude float64 longitude float64
@ -46,10 +46,8 @@ type review struct {
} }
type scraper interface { type scraper interface {
index(doc *goquery.Document) (string, []string) index(doc *goquery.Document) (string, []string, error)
review(doc *goquery.Document) (string, string, map[string]float64, float64, error) review(doc *goquery.Document) (string, string, map[string]float64, int64, error)
decode(address string) (float64, float64, error)
load(url string) (*goquery.Document, error)
} }
func makeAbsUrl(ref, base string) (string, error) { func makeAbsUrl(ref, base string) (string, error) {
@ -66,10 +64,10 @@ func makeAbsUrl(ref, base string) (string, error) {
return b.ResolveReference(r).String(), nil return b.ResolveReference(r).String(), nil
} }
func decodeReviews(in chan review, out chan review, scr scraper) { func decodeReviews(in chan review, out chan review, scr scraper, gc *geoCache) {
for rev := range in { for rev := range in {
if rev.err == nil { if rev.err == nil {
rev.latitude, rev.longitude, rev.err = scr.decode(rev.address) rev.latitude, rev.longitude, rev.err = gc.decode(rev.address)
} }
out <- rev out <- rev
@ -78,7 +76,7 @@ func decodeReviews(in chan review, out chan review, scr scraper) {
close(out) close(out)
} }
func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGroup) { func scrapeReview(url string, out chan review, scr scraper, wc *webCache, group *sync.WaitGroup) {
defer group.Done() defer group.Done()
var ( var (
@ -86,14 +84,14 @@ func scrapeReview(url string, out chan review, scr scraper, group *sync.WaitGrou
rev = review{url: url, scr: scr} rev = review{url: url, scr: scr}
) )
if doc, rev.err = scr.load(rev.url); rev.err == nil { if doc, rev.err = wc.load(rev.url); rev.err == nil {
rev.name, rev.address, rev.features, rev.weight, rev.err = scr.review(doc) rev.name, rev.address, rev.features, rev.count, rev.err = scr.review(doc)
} }
out <- rev out <- rev
} }
func scrapeIndex(indexUrl string, out chan review, scr scraper) error { func scrapeIndex(indexUrl string, out chan review, scr scraper, wc *webCache) error {
var group sync.WaitGroup var group sync.WaitGroup
defer func() { defer func() {
@ -102,12 +100,12 @@ func scrapeIndex(indexUrl string, out chan review, scr scraper) error {
}() }()
for { for {
doc, err := scr.load(indexUrl) doc, err := wc.load(indexUrl)
if err != nil { if err != nil {
return err return err
} }
nextIndexUrl, reviewUrls := scr.index(doc) nextIndexUrl, reviewUrls, err := scr.index(doc)
if err != nil { if err != nil {
return err return err
} }
@ -119,7 +117,7 @@ func scrapeIndex(indexUrl string, out chan review, scr scraper) error {
} }
group.Add(1) group.Add(1)
go scrapeReview(absUrl, out, scr, &group) go scrapeReview(absUrl, out, scr, wc, &group)
} }
if err != nil { if err != nil {
@ -139,7 +137,7 @@ func scrapeIndex(indexUrl string, out chan review, scr scraper) error {
return nil return nil
} }
func scrape(url string, scr scraper) ([]review, error) { func scrape(url string, scr scraper, gc *geoCache, wc *webCache) ([]review, error) {
out := make(chan review, 128) out := make(chan review, 128)
in := make(chan review, 128) in := make(chan review, 128)
@ -163,8 +161,8 @@ func scrape(url string, scr scraper) ([]review, error) {
} }
}() }()
go decodeReviews(in, out, scr) go decodeReviews(in, out, scr, gc)
err := scrapeIndex(url, in, scr) err := scrapeIndex(url, in, scr, wc)
return reviews, err return reviews, err
} }

View File

@ -32,7 +32,6 @@ import (
) )
type tabelog struct { type tabelog struct {
scrapeCtx
} }
func (tabelog) define(keyword string) semantics { func (tabelog) define(keyword string) semantics {

View File

@ -32,7 +32,6 @@ import (
) )
type tripadvisor struct { type tripadvisor struct {
scrapeCtx
} }
func (tripadvisor) define(keyword string) semantics { func (tripadvisor) define(keyword string) semantics {