Read URLs from command line instead of file

parent f23e54c022
commit 7b3ee7cc32
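The new flow leans entirely on the standard flag package: flag.Parse() consumes the declared flags, and whatever is left on the command line is exposed through flag.NArg() and flag.Args(). Below is a minimal, self-contained sketch of that pattern, not the project's actual code; the binary name and URLs in the comments are made up for illustration, and only the -db flag from the real program is reused.

package main

import (
	"flag"
	"fmt"
	"log"
)

func main() {
	// Flags are declared and parsed exactly as before.
	dbPath := flag.String("db", "data/db.sqlite3", "database output path")
	flag.Parse()

	// Every remaining (non-flag) argument is treated as an index URL to scrape.
	if flag.NArg() == 0 {
		log.Fatal("no URLs specified on command line")
	}

	for _, u := range flag.Args() {
		fmt.Printf("would scrape %s into %s\n", u, *dbPath)
	}
}

// Hypothetical invocation (binary name assumed):
//   scrape -db data/db.sqlite3 https://example.com/list1 https://example.com/list2
// which replaces the old -urls data/urls.txt flag.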
@@ -23,10 +23,9 @@
 package main
 
 import (
-	"bufio"
 	"flag"
+	"fmt"
 	"log"
-	"os"
 	"path/filepath"
 
 	_ "github.com/mattn/go-sqlite3"
@@ -66,27 +65,12 @@ func loadConverters(directory string) ([]*converter, error) {
 	return convs, nil
 }
 
-func loadUrls(filename string) ([]string, error) {
-	file, err := os.Open(filename)
-	if err != nil {
-		return nil, err
-	}
-	defer file.Close()
-
-	var urls []string
-	for scanner := bufio.NewScanner(file); scanner.Scan(); {
-		if url := scanner.Text(); len(url) > 0 {
-			urls = append(urls, url)
-		}
-	}
-
-	return urls, nil
-}
-
 func scrapeReviews(urls []string, converters []*converter, gc *geoCache, wc *webCache) ([]review, error) {
 	var reviews []review
 
 	for _, u := range urls {
+		var scraped bool
+
 		for _, c := range converters {
 			if !c.compatible(u) {
 				continue
@@ -98,8 +82,13 @@ func scrapeReviews(urls []string, converters []*converter, gc *geoCache, wc *web
 			}
 
 			reviews = append(reviews, revs...)
+			scraped = true
 			break
 		}
+
+		if !scraped {
+			return nil, fmt.Errorf("no converters found for %s", u)
+		}
 	}
 
 	return reviews, nil
@@ -108,7 +97,6 @@ func scrapeReviews(urls []string, converters []*converter, gc *geoCache, wc *web
 func main() {
 	var (
 		dbPath         = flag.String("db", "data/db.sqlite3", "database output path")
-		urlsPath       = flag.String("urls", "data/urls.txt", "index URLs to scrape")
 		convertersPath = flag.String("converters", "data/converters", "directory for converters")
 		stationsPath   = flag.String("stations", "data/stations.json", "station geolocation data")
 		geocachePath   = flag.String("geocache", "cache/geocache.json", "geolocation data cache")
@@ -117,6 +105,10 @@ func main() {
 
 	flag.Parse()
 
+	if flag.NArg() == 0 {
+		log.Fatal("no URLs specified on command line")
+	}
+
 	log.Printf("loading geocache from %s...", *geocachePath)
 	gc, err := newGeoCache(*geocachePath)
 	if err != nil {
@@ -130,12 +122,6 @@ func main() {
 		log.Fatal(err)
 	}
 
-	log.Printf("loading urls from %s...", *urlsPath)
-	urls, err := loadUrls(*urlsPath)
-	if err != nil {
-		log.Fatal(err)
-	}
-
 	log.Printf("loading converters from %s...", *convertersPath)
 	converters, err := loadConverters(*convertersPath)
 	if err != nil {
@ -146,7 +132,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
log.Print("scraping reviews...")
|
log.Print("scraping reviews...")
|
||||||
reviews, err := scrapeReviews(urls, converters, gc, wc)
|
reviews, err := scrapeReviews(flag.Args(), converters, gc, wc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||