From 8365d6a65faf536767e3f470b8611bb4ced6477c Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Tue, 22 Sep 2015 11:57:07 +0900 Subject: [PATCH] Tripadvisor transitioned to toml --- build/build.go | 15 ++- build/cache/geocache.json | 84 +++++++++++++ build/converter.go | 3 +- build/data/{ => converters}/tripadvisor.toml | 31 ++--- build/data/db.sqlite3 | Bin 345088 -> 345088 bytes build/data/urls.txt | 15 --- build/tripadvisor.go | 120 ------------------- 7 files changed, 111 insertions(+), 157 deletions(-) rename build/data/{ => converters}/tripadvisor.toml (57%) delete mode 100644 build/tripadvisor.go diff --git a/build/build.go b/build/build.go index 1b41469..ac8428f 100644 --- a/build/build.go +++ b/build/build.go @@ -115,30 +115,33 @@ func main() { flag.Parse() - log.Printf("loading geocache %s...", *geocachePath) + log.Printf("loading geocache from %s...", *geocachePath) gc, err := newGeoCache(*geocachePath) if err != nil { log.Fatal(err) } defer gc.save() - log.Printf("loading webcache %s...", *webcachePath) + log.Printf("loading webcache from %s...", *webcachePath) wc, err := newWebCache(*webcachePath) if err != nil { log.Fatal(err) } - log.Printf("loading urls %s...", *urlsPath) + log.Printf("loading urls from %s...", *urlsPath) urls, err := loadUrls(*urlsPath) if err != nil { log.Fatal(err) } - log.Printf("loading converters %s...", *convertersPath) + log.Printf("loading converters from %s...", *convertersPath) converters, err := loadConverters(*convertersPath) if err != nil { log.Fatal(err) } + for _, c := range converters { + log.Printf("*\t%s", c.Name) + } log.Print("scraping reviews...") reviews, err := scrapeReviews(urls, converters, gc, wc) @@ -152,12 +155,12 @@ func main() { log.Print("computing data semantics..") computeSemantics(restaurants) - log.Print("computing station data...") + log.Printf("computing station data from %s...", *stationsPath) if err := computeStations(restaurants, *stationsPath); err != nil { log.Fatal(err) } - log.Print("saving data...") + log.Printf("saving data to %s...", *dbPath) if err := dumpData(*dbPath, restaurants); err != nil { log.Fatal(err) } diff --git a/build/cache/geocache.json b/build/cache/geocache.json index 5761060..4de1b42 100644 --- a/build/cache/geocache.json +++ b/build/cache/geocache.json @@ -10627,10 +10627,82 @@ "Latitude": 35.299925, "Longitude": 139.480876 }, + "〒220-0005西区南幸1-3-1 横浜モアーズ8F": { + "Latitude": 35.4672263, + "Longitude": 139.6221788 + }, + "〒220-0011西区高島2-13-12 崎陽軒本店B1": { + "Latitude": 35.4622889, + "Longitude": 139.6222899 + }, + "〒220-0012西区みなとみらい 4-6-2グランドセントラルタワー 1階": { + "Latitude": 35.4587197, + "Longitude": 139.6294296 + }, + "〒220-8501西区北幸1-3-23横浜ベイシェラトンホテル\u0026タワーズ 8階": { + "Latitude": 35.4667939, + "Longitude": 139.6201484 + }, + "〒220-8522西区みなとみらい1-1-1ヨコハマグランドインターコンチネンタルホテル31階": { + "Latitude": 35.4576034, + "Longitude": 139.6374556 + }, + "〒221-0835神奈川区鶴屋町2-10-7タムラビル 1F": { + "Latitude": 35.4688603, + "Longitude": 139.6229814 + }, + "〒225-0012青葉区あざみ野南2-14-3": { + "Latitude": 35.5652366, + "Longitude": 139.5523784 + }, + "〒231-0007中区弁天通6-791F": { + "Latitude": 35.4492682, + "Longitude": 139.6353589 + }, + "〒231-0014中区常盤町5-58-2": { + "Latitude": 35.4472157, + "Longitude": 139.6349617 + }, + "〒231-0021中区日本大通11 情報文化センター1F": { + "Latitude": 35.4462405, + "Longitude": 139.6428978 + }, + "〒231-0023中区山下町10 ホテルニューグランド本館5F": { + "Latitude": 35.444773, + "Longitude": 139.649593 + }, + "〒231-0023中区山下町149": { + "Latitude": 35.4436523, + "Longitude": 139.6459251 + }, + "〒231-0023中区山下町189": { + "Latitude": 35.442644, + "Longitude": 139.6468862 + }, + "〒231-0023中区山下町192": { + "Latitude": 35.4433273, + "Longitude": 139.6467417 + }, + "〒231-0023中区山下町77": { + "Latitude": 35.4441466, + "Longitude": 139.6468555 + }, + "〒231-0861中区元町1-31ラ・スピーガ元町001号室": { + "Latitude": 35.4410507, + "Longitude": 139.6503305 + }, + "〒240-0006保土ケ谷区星川3-23-13": { + "Latitude": 35.4557923, + "Longitude": 139.5825558 + }, "ラゾーナ川崎プラザ 4F": { "Latitude": 35.5329594, "Longitude": 139.6959237 }, + "中区石川町1-8": { + "Latitude": 35.4387639, + "Longitude": 139.6445226 + }, "中原区木月2-5-7": { "Latitude": 35.5647315, "Longitude": 139.6551794 @@ -10738,5 +10810,17 @@ "神奈川県鎌倉市大船1-24-1大船駅前ビル5F": { "Latitude": 35.35328, "Longitude": 139.5325737 + }, + "西区南幸1-6-31横浜タカシマヤ 8F": { + "Latitude": 35.4651997, + "Longitude": 139.6191458 + }, + "西区高島2-16-B1横浜駅東口地下街ポルタ": { + "Latitude": 35.4622889, + "Longitude": 139.6222899 + }, + "西区高島2-19-12スカイビル 11F": { + "Latitude": 35.4645561, + "Longitude": 139.6246488 } } \ No newline at end of file diff --git a/build/converter.go b/build/converter.go index 7ae7310..de70ac2 100644 --- a/build/converter.go +++ b/build/converter.go @@ -99,7 +99,7 @@ func (l *selector) locateStrings(doc *goquery.Document) ([]string, error) { } } - strs = append(strs, str) + strs = append(strs, strings.TrimSpace(str)) }) return strs, err @@ -137,6 +137,7 @@ func (l *selector) locateFloat(doc *goquery.Document) (float64, error) { // converter // type converter struct { + Name string Domains []string Index struct { diff --git a/build/data/tripadvisor.toml b/build/data/converters/tripadvisor.toml similarity index 57% rename from build/data/tripadvisor.toml rename to build/data/converters/tripadvisor.toml index 1a8ae00..4c4c414 100644 --- a/build/data/tripadvisor.toml +++ b/build/data/converters/tripadvisor.toml @@ -1,4 +1,5 @@ name = "tripadvisor" +domains = ["www.tripadvisor.com"] [index.items] path = "a.property_title" @@ -12,53 +13,53 @@ name = "tripadvisor" path = "h1#HEADING" [item.address] - path = "address span.format_address > span:not(.extended-address)" + path = "address span.format_address" -[item.weight] +[item.count] path = "h3.reviews_header" - regex = "^(\d+)" + regEx = "^(\\d+)" [item.props] [item.props.service] accomodating = 1.0 affordable = 0.0 atmospheric = 0.0 - delicious: 0.0 + delicious = 0.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(2)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(1) > div:nth-child(2) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" [item.props.food] accomodating = 0.0 affordable = 0.0 atmospheric = 0.0 - delicious: 1.0 + delicious = 1.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(1)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(1) > div:nth-child(1) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" [item.props.value] accomodating = 0.0 affordable = 1.0 atmospheric = 0.0 - delicious: 0.0 + delicious = 0.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(3)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(2) > div:nth-child(1) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" [item.props.atmosphere] accomodating = 0.0 affordable = 0.0 atmospheric = 1.0 - delicious: 0.0 + delicious = 0.0 scale = 5.0 - path = "ul.barChart div.ratingRow img:nth-child(4)" - regex = "^([0-9]*\.?[0-9]+)" + path = "ul.barChart > li:nth-child(2) > div:nth-child(2) img" + regEx = "^([0-9]*\\.?[0-9]+)" attr = "alt" diff --git a/build/data/db.sqlite3 b/build/data/db.sqlite3 index dd8762d880e6de459e21b82fd3d83e5eae88fe0c..1089a9300b8cdcadab22a2f824381611631045b5 100644 GIT binary patch delta 939 zcmXZb`%_F|7{KxO-91}JHA7g^hQ$iwR7frjiQKl_&bG-tl*_nZDwWtJw2T?H!r2RKhB!>5uN<9)wlG11D>8;XMS8V*qtg?=%zBW}+OhvU1kiSedrN9nS_idH-kA|P5Q z9^(=2;Xdx-4sPQXZlW1Ca2?lh6<2T>m(YZB2;(fy;2TaOghrggNpO*(0mu2iI7Sh~ zL$DOxaNsEFa0Ioe!68(m3YDnGPmIO`c<~*FF&=wx5JRyao3RNSQHBj}!g}<>I;_PS ztmY~ud6m)YN;4EsnbeI6Pr201m?!javU%;&jC1$JH{~->ty9X9NG#$1u^z!!R5pFcNNzf(K)fj&Ya(4H?Kpmev0x xipiLQY)nNCreQi}AQyRk0{y8mybV>A4IHwl@1=ljmR z|C=>h?=@QQ*7(>llEh|8k`w_pg4&I#M!aeGXLvYXG4U%bb=|mpHp@J6u`4@ToP3z^ zRbxWehPyrZPh{4!k3FT2MTw{4XYr%>UOW=E0= zR;hhtV|3#SyA6BrH(h(&ceovu>k1AC6nBt+1 z-&gv=A1JZ#UzI5MxPp!ur@9}mgg&BnPg1uZS3#%dL8qzWnHkX80_fZ%sAD{Iel&Dp zB=kuN^l1`wsXuhN7c>Zj`_iLvkiw*$5xN-GdQ{ghhkK~kNFeCk0%3Tb3+Rh2zzj^_ zB|wAB2Gmy;$OKfDMwpa@I_)4+5v6O@25@G>X`b1+GUV>ze*a{+B%7eHqYowk)= zp&pXKB0$?s=Y$K`K?-;!^lf}Xj|hxg*rhD~9j|5gS&4Kq6x&OmqHwi7I<6+-*&~rj zMOK$%sWsbR%+wb-?6SS8*6mpAaE7+G4-B1I+kO7!goC>DhCRPu`SVFawn|(!g||<^^0XIS9+*T(V4E!3UNAVlGRo3bm;S)vejO# zpC&u2YOB)?>G(@)s~gEjw$+Z~-4}I!zv=3z{@YG9Tr^uc&Ru!qa*bK8&n+$b(qYzB zhss;qwIZv{ZI_iU>dI~d6xOA)4V_fHtlD1RG}M)D7-PsV8q)Lh1=bqbRcp1$INEL2 zdbzD`2Ty)Ik4LuMpT{EFdX5(_4h&Ztqm!(iYaO7IEKax0VRc$vZll4N5xRMHM8%!2 z1>LEH|W0i1>Gu>Gr0@!?)~ZMRKF; zl5JLx+7O*2Kc7bNj0$~j;q=1k`J>z9CN{kC^6k_K?cbj^U8c5=Wt;vO_?48O&TW{N zVzzu3)8UeT!5}QBOIklQ-Al|`aozY%k!-iiRz*_X&$kl|l*leutvkbD%r<1`%c|}6 z(3|%L&A+kp%gBk#cbPsvtnb*m`x{f|cbjh2pIqAf1e_sQ_m}joc(F-Beff0MGL=mCE)ShpX0sK^rpKTCTFpw$?+eU)zm{un4``vbS~4 zzA@*_@^US?iznP1Ir)@jXy1)Ra-D2*JLMgE-EgFqcWz<{3PdCv!dT&wKH1 zT*o>4js492!@gtRuzTzd`+k(_#qD4>H)VRFje=r~ zMptZ>MtP8_QRIx)3SgsVfipBp-*k<#FioT68>La+jnoFg29HL$ma0+O4AWxZp&G?) zvWBpgh6Go`gM)53DM;76K1dms7^F-X7^K)35TsCv3sT{PAQkQ(q{97z6vw@T6u`ZL z6tz8r6tF#ldib>OC);i|xLxH>>lSrecTtqoAGwgC0I zEI`rP?4!hAfqJBdibVq6%F=9KZE!A zb#S4-JABpO8-B%4S)bykLD~H@G?zaO?(5HjSNJKE7y0Q%B>M~Dg??I4rJs^~fu9yS z-%lIV#ZL{)^HYA#^{<30{8YDGeF6E-m>P{2b5y#Cl&W-Vd09<@%hVLOM5R5OsnY&U zS7|S&sWjfHDvi2GrPF$fO4**TQm=XHLU^)DyYPg%<~?@YnzqE(lPsvjaKsl0cz1bx$fw$fspT;KnH$_>a7j zU1IrC`~UBwc>hn2dbNLa(&WzTIZ-E-%k{E#l^b!HDIrOgAG7@;>@?Sb^0k*Sp*{Ss9|nMbCBo zDK%={c9(2LZrE(qb_6tTHx7NCd#Tlhu+$g1YpU(;IvJsY{4z%A3mkUav-jGzcMnTz zyHO-!N9@nu5DHY7Wk*E9Phy(sw@cT~ls9kq<4Zhhzv%d?&XU2hLIyYx$IA-C4nF-K DF)gZ8 diff --git a/build/data/urls.txt b/build/data/urls.txt index b378c1e..850c5d2 100644 --- a/build/data/urls.txt +++ b/build/data/urls.txt @@ -1,16 +1 @@ -http://tabelog.com/en/kanagawa/rstLst/ - http://www.tripadvisor.com/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021282-Sagamihara_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html - -http://www.tripadvisor.com/Restaurants-g303156-Kamakura_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g298174-Yokosuka_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021278-Odawara_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g681222-Hiratsuka_Kanagawa_Prefecture_Kanto.html - -http://www.tripadvisor.com/Restaurants-g298169-Atsugi_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021286-Yamato_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html -http://www.tripadvisor.com/Restaurants-g1021285-Hadano_Kanagawa_Prefecture_Kanto.html diff --git a/build/tripadvisor.go b/build/tripadvisor.go deleted file mode 100644 index 3e821ad..0000000 --- a/build/tripadvisor.go +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2015 Alex Yatskov - * Author: Alex Yatskov - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of - * this software and associated documentation files (the "Software"), to deal in - * the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of - * the Software, and to permit persons to whom the Software is furnished to do so, - * subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package main - -import ( - "errors" - "fmt" - "strconv" - "strings" - - "github.com/PuerkitoBio/goquery" -) - -type tripadvisor struct { -} - -func (tripadvisor) define(keyword string) semantics { - return map[string]semantics{ - "food": {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 1.0}, - "service": {Accomodating: 1.0, Affordable: 0.0, Atmospheric: 0.0, Delicious: 0.0}, - "value": {Accomodating: 0.0, Affordable: 1.0, Atmospheric: 0.0, Delicious: 0.0}, - "atmosphere": {Accomodating: 0.0, Affordable: 0.0, Atmospheric: 1.0, Delicious: 0.0}, - }[keyword] -} - -func (tripadvisor) index(doc *goquery.Document) (string, []string) { - var reviewUrls []string - doc.Find("a.property_title").Each(func(index int, sel *goquery.Selection) { - if href, ok := sel.Attr("href"); ok { - reviewUrls = append(reviewUrls, href) - } - }) - - var nextIndexUrl string - if href, ok := doc.Find("div.deckTools.btm a.nav.next.rndBtn.rndBtnGreen.taLnk").Attr("href"); ok { - nextIndexUrl = href - } - - return nextIndexUrl, reviewUrls -} - -func (tripadvisor) review(doc *goquery.Document) (name, address string, features map[string]float64, weight float64, err error) { - if name = strings.TrimSpace(doc.Find("h1#HEADING").Text()); len(name) == 0 { - err = errors.New("invalid name") - return - } - - { - var addressParts []string - doc.Find("address span.format_address > span:not(.extended-address)").Each(func(index int, sel *goquery.Selection) { - addressParts = append(addressParts, strings.TrimSpace(sel.Text())) - }) - - if len(addressParts) == 0 { - err = errors.New("invalid address") - return - } - - address = strings.Join(addressParts, " ") - } - - { - ratings := doc.Find("ul.barChart div.ratingRow img.sprite-rating_s_fill") - if ratings.Length() != 4 { - err = errors.New("invalid ratings") - return - } - - features = make(map[string]float64) - for index, category := range []string{"food", "service", "value", "atmosphere"} { - altText, _ := ratings.Eq(index).Attr("alt") - valueText := strings.Split(altText, " ")[0] - - var value float64 - if value, err = strconv.ParseFloat(valueText, 8); err != nil { - err = fmt.Errorf("invalid rating for %s", category) - return - } - - features[category] = value/2.5 - 1.0 - } - } - - { - weightValid := false - if weightParts := strings.Split(doc.Find("h3.reviews_header").Text(), " "); len(weightParts) > 0 { - if weight, err = strconv.ParseFloat(weightParts[0], 8); err == nil { - weightValid = true - return - } - } - - if !weightValid { - err = fmt.Errorf("invalid review count") - return - } - } - - return -}