diff --git a/build/build.go b/build/build.go index 1b41469..ac8428f 100644 --- a/build/build.go +++ b/build/build.go @@ -115,30 +115,33 @@ func main() { flag.Parse() - log.Printf("loading geocache %s...", *geocachePath) + log.Printf("loading geocache from %s...", *geocachePath) gc, err := newGeoCache(*geocachePath) if err != nil { log.Fatal(err) } defer gc.save() - log.Printf("loading webcache %s...", *webcachePath) + log.Printf("loading webcache from %s...", *webcachePath) wc, err := newWebCache(*webcachePath) if err != nil { log.Fatal(err) } - log.Printf("loading urls %s...", *urlsPath) + log.Printf("loading urls from %s...", *urlsPath) urls, err := loadUrls(*urlsPath) if err != nil { log.Fatal(err) } - log.Printf("loading converters %s...", *convertersPath) + log.Printf("loading converters from %s...", *convertersPath) converters, err := loadConverters(*convertersPath) if err != nil { log.Fatal(err) } + for _, c := range converters { + log.Printf("*\t%s", c.Name) + } log.Print("scraping reviews...") reviews, err := scrapeReviews(urls, converters, gc, wc) @@ -152,12 +155,12 @@ func main() { log.Print("computing data semantics..") computeSemantics(restaurants) - log.Print("computing station data...") + log.Printf("computing station data from %s...", *stationsPath) if err := computeStations(restaurants, *stationsPath); err != nil { log.Fatal(err) } - log.Print("saving data...") + log.Printf("saving data to %s...", *dbPath) if err := dumpData(*dbPath, restaurants); err != nil { log.Fatal(err) } diff --git a/build/cache/geocache.json b/build/cache/geocache.json index 5761060..4de1b42 100644 --- a/build/cache/geocache.json +++ b/build/cache/geocache.json @@ -10627,10 +10627,82 @@ "Latitude": 35.299925, "Longitude": 139.480876 }, + "〒220-0005西区南幸1-3-1 横浜モアーズ8F": { + "Latitude": 35.4672263, + "Longitude": 139.6221788 + }, + "〒220-0011西区高島2-13-12 崎陽軒本店B1": { + "Latitude": 35.4622889, + "Longitude": 139.6222899 + }, + "〒220-0012西区みなとみらい 4-6-2グランドセントラルタワー 1階": { + "Latitude": 35.4587197, + "Longitude": 139.6294296 + }, + "〒220-8501西区北幸1-3-23横浜ベイシェラトンホテル\u0026タワーズ 8階": { + "Latitude": 35.4667939, + "Longitude": 139.6201484 + }, + "〒220-8522西区みなとみらい1-1-1ヨコハマグランドインターコンチネンタルホテル31階": { + "Latitude": 35.4576034, + "Longitude": 139.6374556 + }, + "〒221-0835神奈川区鶴屋町2-10-7タムラビル 1F": { + "Latitude": 35.4688603, + "Longitude": 139.6229814 + }, + "〒225-0012青葉区あざみ野南2-14-3": { + "Latitude": 35.5652366, + "Longitude": 139.5523784 + }, + "〒231-0007中区弁天通6-791F": { + "Latitude": 35.4492682, + "Longitude": 139.6353589 + }, + "〒231-0014中区常盤町5-58-2": { + "Latitude": 35.4472157, + "Longitude": 139.6349617 + }, + "〒231-0021中区日本大通11 情報文化センター1F": { + "Latitude": 35.4462405, + "Longitude": 139.6428978 + }, + "〒231-0023中区山下町10 ホテルニューグランド本館5F": { + "Latitude": 35.444773, + "Longitude": 139.649593 + }, + "〒231-0023中区山下町149": { + "Latitude": 35.4436523, + "Longitude": 139.6459251 + }, + "〒231-0023中区山下町189": { + "Latitude": 35.442644, + "Longitude": 139.6468862 + }, + "〒231-0023中区山下町192": { + "Latitude": 35.4433273, + "Longitude": 139.6467417 + }, + "〒231-0023中区山下町77": { + "Latitude": 35.4441466, + "Longitude": 139.6468555 + }, + "〒231-0861中区元町1-31ラ・スピーガ元町001号室": { + "Latitude": 35.4410507, + "Longitude": 139.6503305 + }, + "〒240-0006保土ケ谷区星川3-23-13": { + "Latitude": 35.4557923, + "Longitude": 139.5825558 + }, "ラゾーナ川崎プラザ 4F": { "Latitude": 35.5329594, "Longitude": 139.6959237 }, + "中区石川町1-8": { + "Latitude": 35.4387639, + "Longitude": 139.6445226 + }, "中原区木月2-5-7": { "Latitude": 35.5647315, "Longitude": 139.6551794 @@ -10738,5 +10810,17 @@ "神奈川県鎌倉市大船1-24-1大船駅前ビル5F": { "Latitude": 35.35328, "Longitude": 139.5325737 + }, + "西区南幸1-6-31横浜タカシマヤ 8F": { + "Latitude": 35.4651997, + "Longitude": 139.6191458 + }, + "西区高島2-16-B1横浜駅東口地下街ポルタ": { + "Latitude": 35.4622889, + "Longitude": 139.6222899 + }, + "西区高島2-19-12スカイビル 11F": { + "Latitude": 35.4645561, + "Longitude": 139.6246488 } } \ No newline at end of file diff --git a/build/converter.go b/build/converter.go index 7ae7310..de70ac2 100644 --- a/build/converter.go +++ b/build/converter.go @@ -99,7 +99,7 @@ func (l *selector) locateStrings(doc *goquery.Document) ([]string, error) { } } - strs = append(strs, str) + strs = append(strs, strings.TrimSpace(str)) }) return strs, err @@ -137,6 +137,7 @@ func (l *selector) locateFloat(doc *goquery.Document) (float64, error) { // converter // type converter struct { + Name string Domains []string Index struct { diff --git a/build/data/tripadvisor.toml b/build/data/converters/tripadvisor.toml similarity index 57% rename from build/data/tripadvisor.toml rename to build/data/converters/tripadvisor.toml index 1a8ae00..4c4c414 100644 --- a/build/data/tripadvisor.toml +++ b/build/data/converters/tripadvisor.toml @@ -1,4 +1,5 @@ name = "tripadvisor" +domains = ["www.tripadvisor.com"] [index.items] path = "a.property_title" @@ -12,53 +13,53 @@ name = "tripadvisor" path = "h1#HEADING" [item.address] - path = "address span.format_address > span:not(.extended-address)" + path = "address span.format_address" -[item.weight] +[item.count] path = "h3.reviews_header" - regex = "^(\d+)" + regEx = "^(\\d+)" [item.props] [item.props.service] accomodating = 1.0 affordable = 0.0 atmospheric = 0.0 - delicious: 0.0 + delicious = 0.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(2)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(1) > div:nth-child(2) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" [item.props.food] accomodating = 0.0 affordable = 0.0 atmospheric = 0.0 - delicious: 1.0 + delicious = 1.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(1)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(1) > div:nth-child(1) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" [item.props.value] accomodating = 0.0 affordable = 1.0 atmospheric = 0.0 - delicious: 0.0 + delicious = 0.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(3)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(2) > div:nth-child(1) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" [item.props.atmosphere] accomodating = 0.0 affordable = 0.0 atmospheric = 1.0 - delicious: 0.0 + delicious = 0.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(4)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(2) > div:nth-child(2) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" diff --git a/build/data/db.sqlite3 b/build/data/db.sqlite3 index dd8762d..1089a93 100644 Binary files a/build/data/db.sqlite3 and b/build/data/db.sqlite3 differ diff --git a/build/data/urls.txt b/build/data/urls.txt index b378c1e..850c5d2 100644 --- a/build/data/urls.txt +++ b/build/data/urls.txt @@ -1,16 +1 @@ -http://tabelog.com/en/kanagawa/rstLst/ - http://www.tripadvisor.com/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021282-Sagamihara_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html - -http://www.tripadvisor.com/Restaurants-g303156-Kamakura_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g298174-Yokosuka_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021278-Odawara_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g681222-Hiratsuka_Kanagawa_Prefecture_Kanto.html - -http://www.tripadvisor.com/Restaurants-g298169-Atsugi_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021286-Yamato_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021285-Hadano_Kanagawa_Prefecture_Kanto.html diff --git a/build/tripadvisor.go b/build/tripadvisor.go deleted file mode 100644 index 3e821ad..0000000 --- a/build/tripadvisor.go +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2015 Alex Yatskov - * Author: Alex Yatskov - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of - * this software and associated documentation files (the "Software"), to deal in - * the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of - * the Software, and to permit persons to whom the Software is furnished to do so, - * subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package main - -import ( - "errors" - "fmt" - "strconv" - "strings" - - "github.com/PuerkitoBio/goquery" -) - -type tripadvisor struct { -} - -func (tripadvisor) define(keyword string) semantics { - return map[string]semantics{ - "food": {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 1.0}, - "service": {Accomodating: 1.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 0.0}, - "value": {Accomodating: 0.0, Affordable: 1.0, Atmospheric: 0.0, Delicious: 0.0}, - "atmosphere": {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 1.0, Delicious: 0.0}, - }[keyword] -} - -func (tripadvisor) index(doc *goquery.Document) (string, []string) { - var reviewUrls []string - doc.Find("a.property_title").Each(func(index int, sel *goquery.Selection) { - if href, ok := sel.Attr("href"); ok { - reviewUrls = append(reviewUrls, href) - } - }) - - var nextIndexUrl string - if href, ok := doc.Find("div.deckTools.btm a.nav.next.rndBtn.rndBtnGreen.taLnk").Attr("href"); ok { - nextIndexUrl = href - } - - return nextIndexUrl, reviewUrls -} - -func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]float64, weight float64, err error) { - if name = strings.TrimSpace(doc.Find("h1#HEADING").Text()); len(name) == 0 { - err = errors.New("invalid name") - return - } - - { - var addressParts []string - doc.Find("address span.format_address > span:not(.extended-address)").Each(func(index int, sel *goquery.Selection) { - addressParts = append(addressParts, strings.TrimSpace(sel.Text())) - }) - - if len(addressParts) == 0 { - err = errors.New("invalid address") - return - } - - address = strings.Join(addressParts, " ") - } - - { - ratings := doc.Find("ul.barChart div.ratingRow img.sprite-rating_s_fill") - if ratings.Length() != 4 { - err = errors.New("invalid ratings") - return - } - - features = make(map[string]float64) - for index, category := range []string{"food", "service", "value", "atmosphere"} { - altText, _ := ratings.Eq(index).Attr("alt") - valueText := strings.Split(altText, " ")[0] - - var value float64 - if value, err = strconv.ParseFloat(valueText, 8); err != nil { - err = fmt.Errorf("invalid rating for %s", category) - return - } - - features[category] = value/2.5 - 1.0 - } - } - - { - weightValid := false - if weightParts := strings.Split(doc.Find("h3.reviews_header").Text(), " "); len(weightParts) > 0 { - if weight, err = strconv.ParseFloat(weightParts[0], 8); err == nil { - weightValid = true - return - } - } - - if !weightValid { - err = fmt.Errorf("invalid review count") - return - } - } - - return -}