1

Add 'build/' from commit '9d3fe8f2ed0b5a5a293811563953b795231174ec'

git-subtree-dir: build
git-subtree-mainline: c63c6a835fdb7230e92cdd855c0850b56afb1dff
git-subtree-split: 9d3fe8f2ed0b5a5a293811563953b795231174ec
This commit is contained in:
Alex Yatskov 2015-08-23 17:47:09 +09:00
commit c4771359bd
13 changed files with 8446 additions and 0 deletions

1
build/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
scrape

1
build/cache/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
webcache

5674
build/cache/geocache.json vendored Normal file

File diff suppressed because it is too large Load Diff

BIN
build/data/db.sqlite3 Normal file

Binary file not shown.

1966
build/data/stations.json Normal file

File diff suppressed because it is too large Load Diff

16
build/data/urls.txt Normal file
View File

@ -0,0 +1,16 @@
http://tabelog.com/en/kanagawa/rstLst/
http://www.tripadvisor.com/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g1021282-Sagamihara_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g303156-Kamakura_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g298174-Yokosuka_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g1021278-Odawara_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g681222-Hiratsuka_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g298169-Atsugi_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g1021286-Yamato_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html
http://www.tripadvisor.com/Restaurants-g1021285-Hadano_Kanagawa_Prefecture_Kanto.html

97
build/geocache.go Normal file
View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"encoding/json"
"io/ioutil"
"os"
"time"
"github.com/kellydunn/golang-geo"
)
// geoPos is a latitude/longitude pair stored as a value in the geocode cache.
// Field order matters: decode builds values with a positional literal.
type geoPos struct {
Latitude float64
Longitude float64
}
// geoCache maps address strings to positions and is backed by a JSON file.
type geoCache struct {
// filename is the path of the JSON file read by load and written by save.
filename string
// data holds the address -> position entries.
data map[string]geoPos
// ticker is received from before every geocoder call in decode, pacing lookups.
ticker *time.Ticker
// coder resolves addresses that are not yet present in data.
coder geo.GoogleGeocoder
}
// newGeoCache builds a geocode cache bound to filename and populates it from
// that file via load. The lookup ticker fires every 200 milliseconds. A load
// failure is returned and no cache is handed back.
func newGeoCache(filename string) (*geoCache, error) {
	gc := new(geoCache)
	gc.filename = filename
	gc.data = make(map[string]geoPos)
	gc.ticker = time.NewTicker(200 * time.Millisecond)

	err := gc.load()
	if err != nil {
		return nil, err
	}
	return gc, nil
}
// load decodes the JSON file at c.filename into c.data. A file that does not
// exist is not an error: the cache simply keeps its current contents. Any
// other open failure, or a decode failure, is returned.
func (c *geoCache) load() error {
	file, err := os.Open(c.filename)
	switch {
	case os.IsNotExist(err):
		return nil
	case err != nil:
		return err
	}
	defer file.Close()

	decoder := json.NewDecoder(file)
	return decoder.Decode(&c.data)
}
// save serializes c.data as indented JSON and writes it to c.filename with
// mode 0644, replacing the file's previous contents. It returns the marshal
// error or the write error, whichever occurs first.
func (c *geoCache) save() error {
	encoded, err := json.MarshalIndent(c.data, "", " ")
	if err == nil {
		err = ioutil.WriteFile(c.filename, encoded, 0644)
	}
	return err
}
// decode returns the position for address. Cached addresses are answered
// immediately; otherwise it waits for the next ticker tick, asks the geocoder,
// and stores a successful answer in c.data. Failed lookups are not cached.
func (c *geoCache) decode(address string) (geoPos, error) {
	cached, found := c.data[address]
	if found {
		return cached, nil
	}

	// Pace outgoing geocoder requests.
	<-c.ticker.C

	point, err := c.coder.Geocode(address)
	if err != nil {
		return geoPos{}, err
	}

	resolved := geoPos{Latitude: point.Lat(), Longitude: point.Lng()}
	c.data[address] = resolved
	return resolved, nil
}

226
build/scrape.go Normal file
View File

@ -0,0 +1,226 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"bufio"
"database/sql"
"errors"
"net/url"
"os"
_ "github.com/mattn/go-sqlite3"
)
// scrapeDataUrls reads filename line by line, treating every non-empty line as
// the first index page of a review site, and returns the restaurants scraped
// from all of them.
//
// The site is selected by the URL's host: "tabelog.com" or
// "www.tripadvisor.com". Any other host, an unparsable URL, or a failure while
// reading the file aborts the run with an error and no results.
func scrapeDataUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var results []restaurant
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 {
			continue
		}

		parsed, err := url.Parse(line)
		if err != nil {
			return nil, err
		}

		var items []restaurant
		switch parsed.Host {
		case "tabelog.com":
			items = scrape(line, wc, gc, tabelog{})
		case "www.tripadvisor.com":
			items = scrape(line, wc, gc, tripadvisor{})
		default:
			return nil, errors.New("unsupported review site")
		}
		results = append(results, items...)
	}

	// Scan reports false for both end-of-file and failure; only Err tells
	// them apart, so a read error must not be mistaken for a short file.
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return results, nil
}
// scrapeData opens the geocode cache and the web cache, then scrapes every URL
// listed in urlsPath. The geocode cache is saved when the function returns,
// whether or not scraping succeeded.
func scrapeData(urlsPath, geocachePath, webcachePath string) ([]restaurant, error) {
	gc, gcErr := newGeoCache(geocachePath)
	if gcErr != nil {
		return nil, gcErr
	}
	// Runs on every exit path below; the error returned by save is discarded.
	defer gc.save()

	wc, wcErr := newWebCache(webcachePath)
	if wcErr != nil {
		return nil, wcErr
	}

	found, scrapeErr := scrapeDataUrls(urlsPath, wc, gc)
	if scrapeErr != nil {
		return nil, scrapeErr
	}
	return found, nil
}
// computeStnData loads the station list from stationsPath and fills in
// closestStnName and closestStnDist on every element of restaurants, in place.
// It fails only if the station list cannot be loaded.
func computeStnData(restaurants []restaurant, stationsPath string) error {
	query, err := newStationQuery(stationsPath)
	if err != nil {
		return err
	}

	// Index into the slice so the updates land on the caller's elements
	// rather than on per-iteration copies.
	for idx := range restaurants {
		name, dist := query.closestStation(restaurants[idx].latitude, restaurants[idx].longitude)
		restaurants[idx].closestStnName = name
		restaurants[idx].closestStnDist = dist
	}
	return nil
}
// buildFeatures maps a restaurant's scraped rating categories onto the four
// columns stored in the database: "food" -> delicious, "service" ->
// accommodating, "value" -> affordable, "atmosphere" -> atmospheric.
// A category missing from r.features yields 0.
//
// NOTE(review): the tabelog scraper stores "dishes" and "cost" rather than
// "food" and "value", so those two columns come out as 0 for tabelog entries -
// confirm whether a key mapping is intended.
func buildFeatures(r restaurant) (delicious, accommodating, affordable, atmospheric float64) {
	scores := r.features
	delicious = scores["food"]
	accommodating = scores["service"]
	affordable = scores["value"]
	atmospheric = scores["atmosphere"]
	return delicious, accommodating, affordable, atmospheric
}
// dumpData rebuilds the SQLite database at dbPath from scratch.
//
// The reviews, categories, history and historyGroups tables are each dropped
// if present and recreated. reviews receives one row per restaurant with
// accessCount set to 0; categories receives three fixed descriptions; history
// and historyGroups are left empty.
//
// The first SQL error stops the run and is returned; statements already
// executed are not rolled back.
func dumpData(dbPath string, restaraunts []restaurant) error {
	db, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		return err
	}
	defer db.Close()

	_, err = db.Exec(`
DROP TABLE IF EXISTS reviews;
CREATE TABLE reviews(
name VARCHAR(100) NOT NULL,
url VARCHAR(200) NOT NULL,
delicious FLOAT NOT NULL,
accommodating FLOAT NOT NULL,
affordable FLOAT NOT NULL,
atmospheric FLOAT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL,
closestStnDist FLOAT NOT NULL,
closestStnName VARCHAR(100) NOT NULL,
accessCount INTEGER NOT NULL,
id INTEGER PRIMARY KEY
)`)
	if err != nil {
		return err
	}

	for _, r := range restaraunts {
		delicious, accommodating, affordable, atmospheric := buildFeatures(r)

		// The bound values must follow the column list exactly: latitude
		// first, then longitude.
		_, err = db.Exec(`
INSERT INTO reviews(
name,
url,
delicious,
accommodating,
affordable,
atmospheric,
latitude,
longitude,
closestStnDist,
closestStnName,
accessCount
) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
			r.name,
			r.url,
			delicious,
			accommodating,
			affordable,
			atmospheric,
			r.latitude,
			r.longitude,
			r.closestStnDist,
			r.closestStnName,
			0)
		if err != nil {
			return err
		}
	}

	_, err = db.Exec(`
DROP TABLE IF EXISTS categories;
CREATE TABLE categories(
description VARCHAR(200) NOT NULL,
id INTEGER PRIMARY KEY)`)
	if err != nil {
		return err
	}

	for _, category := range []string{"I prefer quiet places", "I enjoy Mexican Food", "I drive a car"} {
		if _, err := db.Exec("INSERT INTO categories(description) VALUES (?)", category); err != nil {
			return err
		}
	}

	_, err = db.Exec(`
DROP TABLE IF EXISTS history;
CREATE TABLE history(
date DATETIME NOT NULL,
reviewId INTEGER NOT NULL,
id INTEGER PRIMARY KEY,
FOREIGN KEY(reviewId) REFERENCES reviews(id))`)
	if err != nil {
		return err
	}

	_, err = db.Exec(`
DROP TABLE IF EXISTS historyGroups;
CREATE TABLE historyGroups(
categoryId INTEGER NOT NULL,
categoryValue FLOAT NOT NULL,
historyId INTEGER NOT NULL,
FOREIGN KEY(historyId) REFERENCES history(id),
FOREIGN KEY(categoryId) REFERENCES categories(id))`)
	if err != nil {
		return err
	}

	return nil
}
// main runs the build pipeline: scrape the review sites, attach the closest
// station to every restaurant, then write the result to the SQLite database.
// The first stage that fails stops the pipeline and the program panics with
// that error.
func main() {
	restaurants, err := scrapeData("data/urls.txt", "cache/geocache.json", "cache/webcache")
	if err == nil {
		err = computeStnData(restaurants, "data/stations.json")
	}
	if err == nil {
		err = dumpData("data/db.sqlite3", restaurants)
	}
	if err != nil {
		panic(err)
	}
}

157
build/scraper.go Normal file
View File

@ -0,0 +1,157 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"log"
"net/url"
"sync"
"github.com/PuerkitoBio/goquery"
)
// restaurant is one scraped review plus the location data derived for it
// by later pipeline stages.
type restaurant struct {
// name, address and features come from scraper.review (set in scrapeReview).
name string
address string
// url is the absolute URL of the review page.
url string
// features maps a site-specific rating category to its score.
features map[string]float64
// latitude and longitude are filled in by decodeReviews from the geocoded address.
latitude float64
longitude float64
// closestStnName and closestStnDist are filled in by computeStnData.
closestStnName string
closestStnDist float64
}
// scraper extracts data from the pages of one review site.
type scraper interface {
// index returns the link to the next index page and the review links found
// on doc. scrapeIndex treats an empty next link as the end of pagination and
// resolves all returned links against the index page URL.
index(doc *goquery.Document) (string, []string)
// review returns, in order, the restaurant name, its address and its
// per-category ratings as read from doc; scrapeReview skips the page when
// the error is non-nil.
review(doc *goquery.Document) (string, string, map[string]float64, error)
}
// makeAbsUrl resolves ref against base and returns the result as a string.
// An already absolute ref is returned as is. If either argument fails to
// parse, the parse error is returned together with an empty string.
func makeAbsUrl(ref, base string) (string, error) {
	var (
		baseURL, refURL *url.URL
		err             error
	)
	if baseURL, err = url.Parse(base); err != nil {
		return "", err
	}
	if refURL, err = url.Parse(ref); err != nil {
		return "", err
	}

	resolved := baseURL.ResolveReference(refURL)
	return resolved.String(), nil
}
// decodeReviews geocodes the address of every restaurant received on in and
// forwards it on out with latitude and longitude filled in. A restaurant whose
// address cannot be decoded is logged and dropped. Once in is closed and
// drained, out is closed.
func decodeReviews(in chan restaurant, out chan restaurant, gc *geoCache) {
	for res := range in {
		pos, err := gc.decode(res.address)
		if err != nil {
			log.Printf("failed to decode address for %s (%v)", res.url, err)
			continue
		}
		res.latitude = pos.Latitude
		res.longitude = pos.Longitude
		out <- res
	}
	close(out)
}
// scrapeReview loads the review page at url, extracts its data with scr and
// sends the resulting restaurant on out. A load or extraction failure is
// logged and nothing is sent. group.Done is called on every exit path.
func scrapeReview(url string, out chan restaurant, wc *webCache, group *sync.WaitGroup, scr scraper) {
	defer group.Done()

	page, loadErr := wc.load(url)
	if loadErr != nil {
		log.Printf("failed to load review at %s (%v)", url, loadErr)
		return
	}

	name, address, features, parseErr := scr.review(page)
	if parseErr != nil {
		log.Printf("failed to scrape review at %s (%v)", url, parseErr)
		return
	}

	var found restaurant
	found.name = name
	found.address = address
	found.features = features
	found.url = url
	out <- found
}
// scrapeIndex walks a paginated review index starting at indexUrl. For every
// page it scrapes all linked reviews concurrently, sending each result on out,
// waits for them to finish, and then follows the "next" link until a page has
// none.
//
// out is closed exactly once when the walk ends, including when an index page
// cannot be loaded; that failure is logged and ends the walk early. Consumers
// ranging over out would otherwise block forever. A link that cannot be
// resolved against the page URL terminates the program via log.Fatal.
func scrapeIndex(indexUrl string, out chan restaurant, wc *webCache, scr scraper) {
	// Every return below must release the consumer of out.
	defer close(out)

	for {
		doc, err := wc.load(indexUrl)
		if err != nil {
			log.Printf("failed to load index at %s (%v)", indexUrl, err)
			return
		}

		nextIndexUrl, reviewUrls := scr.index(doc)

		// Finish all reviews of this page before moving on, so that no
		// sender is still running when out is closed.
		var group sync.WaitGroup
		for _, reviewUrl := range reviewUrls {
			absUrl, err := makeAbsUrl(reviewUrl, indexUrl)
			if err != nil {
				log.Fatal(err)
			}
			group.Add(1)
			go scrapeReview(absUrl, out, wc, &group, scr)
		}
		group.Wait()

		if nextIndexUrl == "" {
			return
		}
		if indexUrl, err = makeAbsUrl(nextIndexUrl, indexUrl); err != nil {
			log.Fatal(err)
		}
	}
}
// scrape collects every restaurant reachable from the index page at url.
// scrapeIndex feeds scraped reviews into one channel, decodeReviews geocodes
// them into a second one, and this function gathers the second channel until
// it is closed. Both channels are buffered with a capacity of 128.
func scrape(url string, wc *webCache, gc *geoCache, scr scraper) []restaurant {
	scraped := make(chan restaurant, 128)
	decoded := make(chan restaurant, 128)

	go scrapeIndex(url, scraped, wc, scr)
	go decodeReviews(scraped, decoded, gc)

	var results []restaurant
	for res := range decoded {
		results = append(results, res)
	}
	return results
}

73
build/stations.go Normal file
View File

@ -0,0 +1,73 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"encoding/json"
"math"
"os"
"github.com/kellydunn/golang-geo"
)
// station is the position of one station as decoded from the stations JSON
// file by newStationQuery.
type station struct {
Latitude float64
Longitude float64
}
// stationQuery answers closest-station lookups over a fixed set of stations.
type stationQuery struct {
// stations maps a station name to its position.
stations map[string]station
}
// newStationQuery reads the JSON file at filename, an object keyed by station
// name, and returns a query over its contents. Open and decode failures are
// returned and no query is handed back.
func newStationQuery(filename string) (*stationQuery, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	query := &stationQuery{}
	if err = json.NewDecoder(file).Decode(&query.stations); err != nil {
		return nil, err
	}
	return query, nil
}
// closestStation returns the name of the station with the smallest
// great-circle distance to the given coordinates, together with that distance
// as reported by geo.Point.GreatCircleDistance. With no stations loaded it
// returns "" and math.MaxFloat64.
func (s *stationQuery) closestStation(latitude, longitude float64) (name string, distance float64) {
	origin := geo.NewPoint(latitude, longitude)

	distance = math.MaxFloat64
	for candidate, stn := range s.stations {
		d := origin.GreatCircleDistance(geo.NewPoint(stn.Latitude, stn.Longitude))
		// Strictly smaller only: on a tie the station seen first is kept.
		if d < distance {
			name, distance = candidate, d
		}
	}
	return name, distance
}

73
build/tabelog.go Normal file
View File

@ -0,0 +1,73 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"errors"
"fmt"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
)
// tabelog is the scraper used for URLs whose host is tabelog.com. It carries
// no state.
type tabelog struct {
}
// index collects the href of every restaurant link on a tabelog listing page
// and the href of the pagination "next" link. The first result is the next
// link, or "" when the page has none; the second is the review links in
// document order. Links are returned as found, without being resolved.
func (tabelog) index(doc *goquery.Document) (string, []string) {
	var reviewUrls []string
	collect := func(_ int, link *goquery.Selection) {
		href, ok := link.Attr("href")
		if !ok {
			return
		}
		reviewUrls = append(reviewUrls, href)
	}
	doc.Find("div.list-rst__header > p > a").Each(collect)

	var nextIndexUrl string
	next := doc.Find("a.c-pagination__target--next")
	if href, ok := next.Attr("href"); ok {
		nextIndexUrl = href
	}

	return nextIndexUrl, reviewUrls
}
// review extracts the restaurant name, address and detailed ratings from a
// tabelog review page.
//
// The page must contain exactly two address elements, of which the first is
// used, trimmed of surrounding whitespace; otherwise an "invalid value for
// address" error is returned. Ratings are read for the categories "dishes",
// "service", "atmosphere", "cost" and "drinks" from the 2nd, 4th, 6th, 8th and
// 10th children of #js-rating-detail. The first rating that does not parse as
// a number yields an "invalid value for <category>" error.
func (tabelog) review(doc *goquery.Document) (name, address string, features map[string]float64, err error) {
	name = doc.Find("a.rd-header__rst-name-main").Text()

	addresses := doc.Find("p.rd-detail-info__rst-address")
	if addresses.Length() != 2 {
		err = errors.New("invalid value for address")
		return name, address, features, err
	}
	address = strings.TrimSpace(addresses.First().Text())

	features = make(map[string]float64)
	for index, category := range []string{"dishes", "service", "atmosphere", "cost", "drinks"} {
		// Rating values sit in the even-numbered children of the list.
		text := doc.Find(fmt.Sprintf("#js-rating-detail > dd:nth-child(%d)", (index+1)*2)).Text()
		// bitSize must be 32 or 64; the values are stored as float64.
		if features[category], err = strconv.ParseFloat(text, 64); err != nil {
			err = fmt.Errorf("invalid value for %s", category)
			return name, address, features, err
		}
	}

	return name, address, features, nil
}

74
build/tripadvisor.go Normal file
View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"errors"
"fmt"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
)
// tripadvisor is the scraper used for URLs whose host is www.tripadvisor.com.
// It carries no state.
type tripadvisor struct {
}
// index collects the href of every restaurant title link on a tripadvisor
// listing page and the href of the bottom "next" button. The first result is
// the next link, or "" when the page has none; the second is the review links
// in document order. Links are returned as found, without being resolved.
func (tripadvisor) index(doc *goquery.Document) (string, []string) {
	var reviewUrls []string
	collect := func(_ int, link *goquery.Selection) {
		href, ok := link.Attr("href")
		if !ok {
			return
		}
		reviewUrls = append(reviewUrls, href)
	}
	doc.Find("a.property_title").Each(collect)

	var nextIndexUrl string
	next := doc.Find("div.deckTools.btm a.nav.next.rndBtn.rndBtnGreen.taLnk")
	if href, ok := next.Attr("href"); ok {
		nextIndexUrl = href
	}

	return nextIndexUrl, reviewUrls
}
// review extracts the restaurant name, address and detailed ratings from a
// tripadvisor review page.
//
// Name and address are trimmed of surrounding whitespace. The page must
// contain exactly four rating images, otherwise a "missing rating data" error
// is returned. The images are taken, in document order, as the "food",
// "service", "value" and "atmosphere" ratings. Each value is the first
// space-separated token of the image's alt text. The first value that does not
// parse as a number yields an "invalid value for <category>" error.
func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]float64, err error) {
	name = strings.TrimSpace(doc.Find("h1#HEADING").Text())
	address = strings.TrimSpace(doc.Find("address span.format_address").Text())

	ratings := doc.Find("ul.barChart div.ratingRow img.sprite-rating_s_fill")
	if ratings.Length() != 4 {
		err = errors.New("missing rating data")
		return name, address, features, err
	}

	features = make(map[string]float64)
	for index, category := range []string{"food", "service", "value", "atmosphere"} {
		// A missing alt attribute leaves alt empty, which fails the parse
		// below and is reported as an invalid value.
		alt, _ := ratings.Eq(index).Attr("alt")
		rating := strings.Split(alt, " ")[0]
		// bitSize must be 32 or 64; the values are stored as float64.
		if features[category], err = strconv.ParseFloat(rating, 64); err != nil {
			err = fmt.Errorf("invalid value for %s", category)
			return name, address, features, err
		}
	}

	return name, address, features, nil
}

88
build/webcache.go Normal file
View File

@ -0,0 +1,88 @@
/*
* Copyright (c) 2015 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"bytes"
"crypto/md5"
"fmt"
"io/ioutil"
"net/http"
"os"
"path"
"time"
"github.com/PuerkitoBio/goquery"
)
// webCache fetches web pages over HTTP and keeps a copy of each on disk.
type webCache struct {
// directory is where the cached page files are stored.
directory string
// ticker is received from before every HTTP fetch in load, pacing requests.
ticker *time.Ticker
}
// newWebCache creates directory (including missing parents, mode 0755) and
// returns a web cache that stores its pages there. The fetch ticker fires
// every 100 milliseconds. A failure to create the directory is returned.
func newWebCache(directory string) (*webCache, error) {
	err := os.MkdirAll(directory, 0755)
	if err != nil {
		return nil, err
	}

	wc := new(webCache)
	wc.directory = directory
	wc.ticker = time.NewTicker(100 * time.Millisecond)
	return wc, nil
}
// urlToLocal returns the path of the cache file for url: the lower-case hex
// MD5 digest of the URL with an ".html" suffix, inside c.directory.
func (c *webCache) urlToLocal(url string) string {
	hasher := md5.New()
	hasher.Write([]byte(url))
	digest := hasher.Sum(nil)

	name := fmt.Sprintf("%x.html", digest)
	return path.Join(c.directory, name)
}
// load returns the parsed document for url.
//
// If a cached copy exists on disk it is parsed and returned without touching
// the network. Otherwise load waits for the next ticker tick, fetches the page
// over HTTP, writes the body to the cache file (mode 0644) and parses it.
//
// Only responses with status 200 OK are cached and returned; any other status
// is reported as an error, so an error page is never stored and then mistaken
// for real content on later runs. Network, read and write failures are
// returned as well.
func (c *webCache) load(url string) (*goquery.Document, error) {
	localPath := c.urlToLocal(url)
	if file, err := os.Open(localPath); err == nil {
		defer file.Close()
		return goquery.NewDocumentFromReader(file)
	}

	// Pace outgoing requests.
	<-c.ticker.C

	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %q fetching %s", res.Status, url)
	}

	// Buffer the body so the same bytes can be both written to disk and parsed.
	var buff bytes.Buffer
	if _, err := buff.ReadFrom(res.Body); err != nil {
		return nil, err
	}

	if err := ioutil.WriteFile(localPath, buff.Bytes(), 0644); err != nil {
		return nil, err
	}

	return goquery.NewDocumentFromReader(&buff)
}