From eb4c14d5aabe21ea2993f20b1b3cc83153c734d6 Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Fri, 21 Aug 2015 18:21:52 +0900 Subject: [PATCH] WIP --- scrape.go | 70 ++++++++++++++++++++++++++++++++++--------------------- urls.txt | 29 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 27 deletions(-) create mode 100644 urls.txt diff --git a/scrape.go b/scrape.go index 2f3a5a5..67d375c 100644 --- a/scrape.go +++ b/scrape.go @@ -22,11 +22,45 @@ package main -import "log" +import ( + "bufio" + "errors" + "log" + "net/url" + "os" +) -type scrapeTask struct { - url string - scr scraper +func scrapeUrls(filename string, wc *webCache, gc *geoCache) ([]restaurant, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + var results []restaurant + scanner := bufio.NewScanner(file) + for scanner.Scan() { + if line := scanner.Text(); len(line) > 0 { + parsed, err := url.Parse(line) + if err != nil { + return nil, err + } + + var items []restaurant + switch parsed.Host { + case "tabelog.com": + items = scrape(line, wc, gc, tabelog{}) + case "www.tripadvisor.com": + items = scrape(line, wc, gc, tripadvisor{}) + default: + return nil, errors.New("unsupported review site") + } + + results = append(results, items...) + } + } + + return results, nil } func main() { @@ -41,28 +75,10 @@ func main() { log.Fatal(err) } - tasks := []scrapeTask{ - {"http://tabelog.com/en/kanagawa/rstLst/1/", tabelog{}}, - - {"http://www.tripadvisor.com/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g1021282-Sagamihara_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g303156-Kamakura_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g298174-Yokosuka_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g1021278-Odawara_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g681222-Hiratsuka_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g298169-Atsugi_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g1021286-Yamato_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, - {"http://www.tripadvisor.com/Restaurants-g1021285-Hadano_Kanagawa_Prefecture_Kanto.html", tripadvisor{}}, + restaurants, err := scrapeUrls("urls.txt", wc, gc) + if err == nil { + log.Print(len(restaurants)) + } else { + log.Fatal(err) } - - count := 0 - for _, task := range tasks { - restaraunts := scrape(task.url, wc, gc, task.scr) - count += len(restaraunts) - } - - log.Print(count) } diff --git a/urls.txt b/urls.txt new file mode 100644 index 0000000..44b0baa --- /dev/null +++ b/urls.txt @@ -0,0 +1,29 @@ +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1401&LstAre=A140101&lat=35.465808055555996&lon=139.61964361111&zoom=16&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1405&LstAre=A140501&lat=35.526987222222&lon=139.70313527778&zoom=15&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1407&LstAre=A140701&lat=35.562953333333&lon=139.36054222222&zoom=12&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1404&LstAre=A140404&lat=35.348035555556&lon=139.46422555556&zoom=13&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= + +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1404&LstAre=A140402&lat=35.307700833333&lon=139.53340527778&zoom=13&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1406&LstAre=A140601&lat=35.276735833333&lon=139.67605583333&zoom=12&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1409&LstAre=A140901&lat=35.256132777778&lon=139.15514944443999&zoom=14&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1404&LstAre=A140407&lat=35.330531666667&lon=139.345865&zoom=14&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= + +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1408&LstAre=A140802&lat=35.439903888889&lon=139.36311694443998&zoom=14&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1407&LstAre=A140702&lat=35.472693611111005&lon=139.46422555556&zoom=12&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1404&LstAre=A140406&lat=35.355736111111&lon=139.40259916667&zoom=13&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= +http://tabelog.com/en/kanagawa/rstLst/?utf8=%E2%9C%93&maxLat=&minLat=&maxLon=&minLon=&LstPrf=A1408&LstAre=A140804&lat=35.377153611111005&lon=139.20982333333&zoom=13&genre_name=&RdoCosTp=2&LstCos=0&LstCosT=11&ChkCard=&LstSitu=0&LstRev=0&LstReserve=0&ChkParking=0&LstSmoking=0&SrtT=trend&PG= + +http://www.tripadvisor.com/Restaurants-g298173-Yokohama_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g298172-Kawasaki_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g1021282-Sagamihara_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g1021277-Fujisawa_Kanagawa_Prefecture_Kanto.html + +http://www.tripadvisor.com/Restaurants-g303156-Kamakura_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g298174-Yokosuka_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g1021278-Odawara_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g681222-Hiratsuka_Kanagawa_Prefecture_Kanto.html + +http://www.tripadvisor.com/Restaurants-g298169-Atsugi_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g1021286-Yamato_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g1021279-Chigasaki_Kanagawa_Prefecture_Kanto.html +http://www.tripadvisor.com/Restaurants-g1021285-Hadano_Kanagawa_Prefecture_Kanto.html