commit 95232a44b9c423a153cabed0f6080b237b4aa568 Author: Alex Yatskov Date: Sat Jan 18 18:46:06 2020 -0800 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6d414dc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +rtk-scrape* diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..1da3d64 --- /dev/null +++ b/go.mod @@ -0,0 +1,10 @@ +module github.com/FooSoft/rtk-scrape + +go 1.13 + +require ( + github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f + github.com/headzoo/surf v1.0.0 + github.com/pacificporter/surf v1.0.3 + gopkg.in/headzoo/surf.v1 v1.0.0 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..99e69cf --- /dev/null +++ b/go.sum @@ -0,0 +1,30 @@ +github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f h1:cWOyRTtBcTBjB0c+GyaQaXgP3g1HVM1KbvZL/Q5QNAM= +github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= +github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= +github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-ole/go-ole v1.2.5-0.20190708054153-938323a72016/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM= +github.com/headzoo/surf v1.0.0 h1:d2h9ftKeQYj7tKqAjQtAA0lJVkO8cTxvzdXLynmNnHM= +github.com/headzoo/surf v1.0.0/go.mod h1:/bct0m/iMNEqpn520y01yoaWxsAEigGFPnvyR1ewR5M= +github.com/headzoo/ut v0.0.0-20181013193318-a13b5a7a02ca/go.mod h1:8926sG02TCOX4RFRzIMFIzRw4xuc/TwO2gtN7teMJZ4= +github.com/itchio/go-brotli v0.0.0-20190702114328-3f28d645a45c h1:Jf20xV/yR/O6eSUqLTuXhka/+54YR59sGwN7b3MkxYk= +github.com/itchio/go-brotli v0.0.0-20190702114328-3f28d645a45c/go.mod h1:oRXh43p/JW9kWosasd+2kHfDpb1ec4m7YrZ5E39s1iI= +github.com/pacificporter/surf v1.0.3 h1:hCAzcBQpTle3ZOiOXMpFW1v+S3iN7IlSX2N6d8RIyMo= +github.com/pacificporter/surf v1.0.3/go.mod h1:Fk2Km1poD/rB2o9a+1USaMgUX2lTVu/xC+swj6m/IAM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/shirou/gopsutil v2.19.12+incompatible h1:WRstheAymn1WOPesh+24+bZKFkqrdCR8JOc77v4xV3Q= +github.com/shirou/gopsutil v2.19.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582 h1:p9xBe/w/OzkeYVKm234g55gMdD1nSIooTir5kV11kfA= +golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20191010194322-b09406accb47 h1:/XfQ9z7ib8eEJX2hdgFTZJ/ntt0swNk5oYBziWeTCvY= +golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +gopkg.in/headzoo/surf.v1 v1.0.0 h1:Ti4LagTvHxSdHYHf5DTqJRhY4+pQYZ0slBPlxo2IWGU= +gopkg.in/headzoo/surf.v1 v1.0.0/go.mod h1:T0BH8276y+OPL0E4tisxCFjBVIAKGbwdYU7AS7/EpQQ= diff --git a/main.go b/main.go new file mode 100644 index 0000000..abe955a --- /dev/null +++ b/main.go @@ -0,0 +1,203 @@ +package main + +import ( + "bufio" + "encoding/json" + "errors" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "regexp" + "sort" + "strconv" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/headzoo/surf" + "github.com/headzoo/surf/agent" + "github.com/headzoo/surf/browser" +) + +type StoryEntry struct { + Author string `json:"author"` + Content string `json:"content"` + ModifiedDate string `json:"modifiedDate"` + StarredCount int `json:"starredCount"` + ReportedCount int `json:"reportedCount"` +} + +type KanjiEntry struct { + Character string `json:"character"` + Reading string `json:"reading"` + FrameNumber int `json:"frameNumber"` + StrokeCount int `json:"strokeCount"` + Story string `json:"story"` + Stories StoryEntryList `json:"stories"` +} + +type StoryEntryList []StoryEntry + +func (e StoryEntryList) Len() int { + return len(e) +} + +func (e StoryEntryList) Less(i, j int) bool { + return e[i].StarredCount > e[j].StarredCount +} + +func (e StoryEntryList) Swap(i, j int) { + e[i], e[j] = e[j], e[i] +} + +func login(br *browser.Browser, username, password string) error { + loginUrl := "http://kanji.koohii.com/login" + if err := br.Open(loginUrl); err != nil { + return err + } + + fm, err := br.Form("form") + if err != nil { + return errors.New("login form not found") + } + + if err := fm.Input("username", username); err != nil { + return err + } + + if err := fm.Input("password", password); err != nil { + return err + } + + if err := fm.Submit(); err != nil { + return err + } + + if br.Title() == "Sign In - Kanji Koohii" { + return errors.New("failed to sign in") + } + + return nil +} + +func scrape(br *browser.Browser, lookup string) (*KanjiEntry, error) { + if err := br.Open(fmt.Sprintf("http://kanji.koohii.com/study/kanji/%s", lookup)); err != nil { + return nil, err + } + + var kanji KanjiEntry + kanji.Character = strings.TrimSpace(br.Find("div.kanji span.cj-k").Text()) + kanji.Reading = strings.TrimSpace(br.Find("div.strokecount span.cj-k").Text()) + kanji.FrameNumber, _ = strconv.Atoi(strings.TrimSpace(br.Find("div.framenum").Text())) + kanji.StrokeCount, _ = strconv.Atoi(strings.Split(strings.TrimSpace(br.Find("div.strokecount").Text()), " ")[0]) + + if kanji.Story = strings.TrimSpace(br.Find("div#sv-textarea").Text()); kanji.Story == "[ click here to enter your story ]" { + kanji.Story = "" + } + + if matches := regexp.MustCompile(`\[(\d+)\]`).FindStringSubmatch(br.Find("div.strokecount").Text()); matches != nil { + kanji.StrokeCount, _ = strconv.Atoi(matches[1]) + } + + br.Find("div.sharedstory").Each(func(i int, s *goquery.Selection) { + var story StoryEntry + story.Author = strings.TrimSpace(s.Find("div.sharedstory_author a").Text()) + story.Content = strings.TrimSpace(s.Find("div.story").Text()) + story.ModifiedDate = strings.TrimSpace(s.Find("div.lastmodified").Text()) + story.StarredCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsStar").Text())) + story.ReportedCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsReport").Text())) + kanji.Stories = append(kanji.Stories, story) + }) + + sort.Sort(kanji.Stories) + + return &kanji, nil +} + +func load(path string) ([]string, error) { + file, err := os.Open(path) + if err != nil { + return nil, err + } + defer file.Close() + + var lines []string + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lines = append(lines, scanner.Text()) + } + + return lines, nil +} + +func save(path string, kanjiList []*KanjiEntry) error { + data, err := json.MarshalIndent(kanjiList, "", " ") + if err != nil { + return err + } + + if err := ioutil.WriteFile(path, data, 0644); err != nil { + return err + } + + return nil +} + +func main() { + var ( + username = flag.String("username", "", "login username for kanji.koohii.com") + password = flag.String("password", "", "login password for kanji.koohii.com") + firstFrame = flag.Int("firstFrame", 1, "kanji first frame") + lastFrame = flag.Int("lastFrame", 3030, "kanji last frame") + ) + + flag.Parse() + + args := flag.Args() + if len(*username) == 0 || len(*password) == 0 || len(args) == 0 || *firstFrame >= *lastFrame { + flag.Usage() + os.Exit(2) + } + + br := surf.NewBrowser() + br.SetUserAgent(agent.Firefox()) + br.AddRequestHeader("Accept", "text/html") + br.AddRequestHeader("Accept-Charset", "utf8") + + log.Println("logging in...") + if err := login(br, *username, *password); err != nil { + log.Fatal(err) + } + + var lookups []string + if len(args) >= 2 { + log.Printf("loading from %s...", args[1]) + var err error + if lookups, err = load(args[1]); err != nil { + log.Fatal(err) + } + } else { + for i := *firstFrame; i <= *lastFrame; i++ { + lookups = append(lookups, strconv.Itoa(i)) + } + } + + var kanjiList []*KanjiEntry + for _, lookup := range lookups { + log.Printf("scraping %s...", lookup) + kanji, err := scrape(br, lookup) + if err != nil { + log.Fatal(err) + } + + kanjiList = append(kanjiList, kanji) + time.Sleep(2 * time.Second) + } + + log.Printf("saving to %s...", args[0]) + if err := save(args[0], kanjiList); err != nil { + log.Fatal(err) + } +}