/*
 * Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
 * Author: Alex Yatskov <alex@foosoft.net>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
|
|
|
|
|
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
2015-08-23 09:02:46 +00:00
|
|
|
"flag"
|
|
|
|
"log"
|
2015-08-23 08:52:21 +00:00
|
|
|
"os"
|
2015-09-22 02:06:20 +00:00
|
|
|
"path/filepath"
|
2015-08-23 08:52:21 +00:00
|
|
|
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
|
|
)
|
|
|
|
|
2015-09-18 06:41:15 +00:00
|
|
|
type restaurant struct {
|
2015-09-18 09:41:01 +00:00
|
|
|
name string
|
2015-09-20 03:08:44 +00:00
|
|
|
address string
|
2015-09-18 09:41:01 +00:00
|
|
|
reviews []review
|
|
|
|
sem semantics
|
2015-09-18 07:33:47 +00:00
|
|
|
|
|
|
|
latitude float64
|
|
|
|
longitude float64
|
|
|
|
|
2015-09-18 06:41:15 +00:00
|
|
|
closestStnName string
|
|
|
|
closestStnDist float64
|
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
func loadConverters(directory string) ([]*converter, error) {
|
|
|
|
matches, err := filepath.Glob(filepath.Join(directory, "*.toml"))
|
2015-08-23 08:52:21 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2015-09-17 06:54:32 +00:00
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
var convs []*converter
|
|
|
|
for _, match := range matches {
|
|
|
|
conv, err := newConverter(match)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2015-09-18 06:41:15 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
convs = append(convs, conv)
|
2015-09-18 06:41:15 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
return convs, nil
|
2015-09-18 06:41:15 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
// loadUrls reads filename line by line and returns every non-empty line, in
// file order, as the list of URLs to process.
//
// It returns an error if the file cannot be opened or if reading fails part
// way through (for example when a line exceeds bufio.Scanner's token limit).
// On error no URLs are returned, so callers never act on a truncated list.
func loadUrls(filename string) ([]string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var urls []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if url := scanner.Text(); len(url) > 0 {
			urls = append(urls, url)
		}
	}

	// Scan reports false for both end-of-file and failure; only Err tells the
	// two apart, so it must be consulted before trusting the collected lines.
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	return urls, nil
}
|
2015-09-17 06:54:32 +00:00
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
func scrapeReviews(urls []string, converters []*converter, gc *geoCache, wc *webCache) ([]review, error) {
|
|
|
|
var reviews []review
|
2015-09-18 09:41:01 +00:00
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
for _, u := range urls {
|
|
|
|
for _, c := range converters {
|
|
|
|
if !c.compatible(u) {
|
2015-09-18 09:41:01 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
revs, err := scrape(u, c, gc, wc)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2015-09-18 09:41:01 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
reviews = append(reviews, revs...)
|
|
|
|
break
|
2015-09-18 09:41:01 +00:00
|
|
|
}
|
|
|
|
}
|
2015-09-22 02:06:20 +00:00
|
|
|
|
|
|
|
return reviews, nil
|
2015-09-18 09:19:39 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
func main() {
|
|
|
|
var (
|
|
|
|
dbPath = flag.String("db", "data/db.sqlite3", "database output path")
|
|
|
|
urlsPath = flag.String("urls", "data/urls.txt", "index URLs to scrape")
|
|
|
|
convertersPath = flag.String("converters", "data/converters", "directory for converters")
|
|
|
|
stationsPath = flag.String("stations", "data/stations.json", "station geolocation data")
|
|
|
|
geocachePath = flag.String("geocache", "cache/geocache.json", "geolocation data cache")
|
|
|
|
webcachePath = flag.String("webcache", "cache/webcache", "web data cache")
|
|
|
|
)
|
2015-09-18 07:33:47 +00:00
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
flag.Parse()
|
2015-09-18 07:33:47 +00:00
|
|
|
|
2015-09-22 02:57:07 +00:00
|
|
|
log.Printf("loading geocache from %s...", *geocachePath)
|
2015-09-22 02:06:20 +00:00
|
|
|
gc, err := newGeoCache(*geocachePath)
|
2015-09-18 07:33:47 +00:00
|
|
|
if err != nil {
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Fatal(err)
|
2015-09-18 07:33:47 +00:00
|
|
|
}
|
2015-09-22 02:06:20 +00:00
|
|
|
defer gc.save()
|
2015-09-18 07:33:47 +00:00
|
|
|
|
2015-09-22 02:57:07 +00:00
|
|
|
log.Printf("loading webcache from %s...", *webcachePath)
|
2015-09-22 02:06:20 +00:00
|
|
|
wc, err := newWebCache(*webcachePath)
|
2015-09-18 07:33:47 +00:00
|
|
|
if err != nil {
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Fatal(err)
|
2015-09-18 07:33:47 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:57:07 +00:00
|
|
|
log.Printf("loading urls from %s...", *urlsPath)
|
2015-09-22 02:06:20 +00:00
|
|
|
urls, err := loadUrls(*urlsPath)
|
2015-09-18 07:33:47 +00:00
|
|
|
if err != nil {
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Fatal(err)
|
2015-09-18 07:33:47 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:57:07 +00:00
|
|
|
log.Printf("loading converters from %s...", *convertersPath)
|
2015-09-22 02:06:20 +00:00
|
|
|
converters, err := loadConverters(*convertersPath)
|
2015-09-18 07:33:47 +00:00
|
|
|
if err != nil {
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Fatal(err)
|
2015-09-18 07:33:47 +00:00
|
|
|
}
|
2015-09-22 02:57:07 +00:00
|
|
|
for _, c := range converters {
|
|
|
|
log.Printf("*\t%s", c.Name)
|
|
|
|
}
|
2015-09-18 07:33:47 +00:00
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Print("scraping reviews...")
|
|
|
|
reviews, err := scrapeReviews(urls, converters, gc, wc)
|
2015-08-23 08:52:21 +00:00
|
|
|
if err != nil {
|
2015-08-23 09:02:46 +00:00
|
|
|
log.Fatal(err)
|
2015-08-23 08:52:21 +00:00
|
|
|
}
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Print("collating data...")
|
2015-09-18 06:41:15 +00:00
|
|
|
restaurants := collateData(reviews)
|
|
|
|
|
2015-09-22 02:06:20 +00:00
|
|
|
log.Print("computing data semantics..")
|
2015-09-18 09:19:39 +00:00
|
|
|
computeSemantics(restaurants)
|
|
|
|
|
2015-09-22 02:57:07 +00:00
|
|
|
log.Printf("computing station data from %s...", *stationsPath)
|
2015-09-18 09:19:39 +00:00
|
|
|
if err := computeStations(restaurants, *stationsPath); err != nil {
|
2015-09-18 06:41:15 +00:00
|
|
|
log.Fatal(err)
|
|
|
|
}
|
2015-08-23 08:52:21 +00:00
|
|
|
|
2015-09-22 02:57:07 +00:00
|
|
|
log.Printf("saving data to %s...", *dbPath)
|
2015-09-18 07:33:47 +00:00
|
|
|
if err := dumpData(*dbPath, restaurants); err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
2015-08-23 08:52:21 +00:00
|
|
|
}
|