Initial commit

This commit is contained in:
Alex Yatskov 2020-01-18 18:46:06 -08:00
commit 95232a44b9
4 changed files with 244 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
rtk-scrape*

10
go.mod Normal file
View File

@ -0,0 +1,10 @@
module github.com/FooSoft/rtk-scrape
go 1.13
require (
github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f
github.com/headzoo/surf v1.0.0
github.com/pacificporter/surf v1.0.3
gopkg.in/headzoo/surf.v1 v1.0.0
)

30
go.sum Normal file
View File

@ -0,0 +1,30 @@
github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f h1:cWOyRTtBcTBjB0c+GyaQaXgP3g1HVM1KbvZL/Q5QNAM=
github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-ole/go-ole v1.2.5-0.20190708054153-938323a72016/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM=
github.com/headzoo/surf v1.0.0 h1:d2h9ftKeQYj7tKqAjQtAA0lJVkO8cTxvzdXLynmNnHM=
github.com/headzoo/surf v1.0.0/go.mod h1:/bct0m/iMNEqpn520y01yoaWxsAEigGFPnvyR1ewR5M=
github.com/headzoo/ut v0.0.0-20181013193318-a13b5a7a02ca/go.mod h1:8926sG02TCOX4RFRzIMFIzRw4xuc/TwO2gtN7teMJZ4=
github.com/itchio/go-brotli v0.0.0-20190702114328-3f28d645a45c h1:Jf20xV/yR/O6eSUqLTuXhka/+54YR59sGwN7b3MkxYk=
github.com/itchio/go-brotli v0.0.0-20190702114328-3f28d645a45c/go.mod h1:oRXh43p/JW9kWosasd+2kHfDpb1ec4m7YrZ5E39s1iI=
github.com/pacificporter/surf v1.0.3 h1:hCAzcBQpTle3ZOiOXMpFW1v+S3iN7IlSX2N6d8RIyMo=
github.com/pacificporter/surf v1.0.3/go.mod h1:Fk2Km1poD/rB2o9a+1USaMgUX2lTVu/xC+swj6m/IAM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/shirou/gopsutil v2.19.12+incompatible h1:WRstheAymn1WOPesh+24+bZKFkqrdCR8JOc77v4xV3Q=
github.com/shirou/gopsutil v2.19.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582 h1:p9xBe/w/OzkeYVKm234g55gMdD1nSIooTir5kV11kfA=
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20191010194322-b09406accb47 h1:/XfQ9z7ib8eEJX2hdgFTZJ/ntt0swNk5oYBziWeTCvY=
golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/headzoo/surf.v1 v1.0.0 h1:Ti4LagTvHxSdHYHf5DTqJRhY4+pQYZ0slBPlxo2IWGU=
gopkg.in/headzoo/surf.v1 v1.0.0/go.mod h1:T0BH8276y+OPL0E4tisxCFjBVIAKGbwdYU7AS7/EpQQ=

203
main.go Normal file
View File

@ -0,0 +1,203 @@
package main
import (
"bufio"
"encoding/json"
"errors"
"flag"
"fmt"
"io/ioutil"
"log"
"os"
"regexp"
"sort"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/headzoo/surf"
"github.com/headzoo/surf/agent"
"github.com/headzoo/surf/browser"
)
// StoryEntry is one shared story scraped from a kanji study page. All values
// are stored as they appear in the page text; nothing is parsed beyond the two
// integer counters.
type StoryEntry struct {
	// Author is the trimmed text of the story author's link.
	Author string `json:"author"`
	// Content is the trimmed text of the story body.
	Content string `json:"content"`
	// ModifiedDate is the trimmed "last modified" text, kept as a raw string.
	ModifiedDate string `json:"modifiedDate"`
	// StarredCount is the star counter; 0 when the text is not an integer.
	StarredCount int `json:"starredCount"`
	// ReportedCount is the report counter; 0 when the text is not an integer.
	ReportedCount int `json:"reportedCount"`
}
// KanjiEntry is the data scraped from a single kanji study page and the unit
// that gets serialized to the JSON output file.
type KanjiEntry struct {
	// Character is the trimmed text of the page's kanji element.
	Character string `json:"character"`
	// Reading is the trimmed text of the span nested in the stroke-count element.
	Reading string `json:"reading"`
	// FrameNumber is the page's frame number; 0 when it could not be parsed.
	FrameNumber int `json:"frameNumber"`
	// StrokeCount is the stroke count; 0 when it could not be parsed.
	StrokeCount int `json:"strokeCount"`
	// Story is the text of the page's story area, or "" when the page shows
	// the "click here to enter your story" placeholder.
	// NOTE(review): presumably the logged-in user's own story — confirm.
	Story string `json:"story"`
	// Stories holds the shared stories found on the page, sorted by star
	// count in descending order.
	Stories StoryEntryList `json:"stories"`
}
// StoryEntryList is a slice of stories that satisfies sort.Interface, ordering
// entries from the most starred to the least starred.
type StoryEntryList []StoryEntry

// Len reports how many stories the list holds.
func (sl StoryEntryList) Len() int { return len(sl) }

// Less reports whether the story at index a must sort before the story at
// index b, which is the case when it has strictly more stars.
func (sl StoryEntryList) Less(a, b int) bool {
	return sl[b].StarredCount < sl[a].StarredCount
}

// Swap exchanges the stories at indexes a and b.
func (sl StoryEntryList) Swap(a, b int) {
	first := sl[a]
	sl[a] = sl[b]
	sl[b] = first
}
// login signs the given browser session in to kanji.koohii.com.
//
// It opens the login page, fills the "username" and "password" inputs of the
// form selected by "form", and submits it. Because a rejected login simply
// re-renders the sign-in page, success is detected by the page title no longer
// being "Sign In - Kanji Koohii".
//
// Every failure is returned with context describing the step that failed; the
// underlying error is wrapped so callers can still inspect it.
func login(br *browser.Browser, username, password string) error {
	// NOTE(review): the login page is fetched over plain http, so the
	// credentials may travel unencrypted unless the server redirects to
	// https — consider switching the scheme once site support is confirmed.
	const loginURL = "http://kanji.koohii.com/login"

	if err := br.Open(loginURL); err != nil {
		return fmt.Errorf("opening login page: %w", err)
	}

	fm, err := br.Form("form")
	if err != nil {
		// Keep the original message but no longer drop the cause.
		return fmt.Errorf("login form not found: %w", err)
	}

	if err := fm.Input("username", username); err != nil {
		return fmt.Errorf("setting username field: %w", err)
	}

	if err := fm.Input("password", password); err != nil {
		return fmt.Errorf("setting password field: %w", err)
	}

	if err := fm.Submit(); err != nil {
		return fmt.Errorf("submitting login form: %w", err)
	}

	// Still on the sign-in page after submitting: the credentials were rejected.
	if br.Title() == "Sign In - Kanji Koohii" {
		return errors.New("failed to sign in")
	}

	return nil
}
// strokeCountPattern matches a bracketed integer such as "[12]" and captures
// the digits. It is compiled once here instead of on every scrape call.
var strokeCountPattern = regexp.MustCompile(`\[(\d+)\]`)

// scrape opens the study page for lookup on kanji.koohii.com and extracts the
// kanji details together with all shared stories found on the page.
//
// lookup is inserted verbatim into the study URL. Extraction is best-effort:
// an element that is missing or not numeric leaves the corresponding field at
// its zero value instead of failing the whole scrape. The returned stories are
// ordered by star count, highest first; stories with equal star counts keep
// the order in which they appear on the page.
//
// An error is returned only when the page itself cannot be opened.
func scrape(br *browser.Browser, lookup string) (*KanjiEntry, error) {
	if err := br.Open(fmt.Sprintf("http://kanji.koohii.com/study/kanji/%s", lookup)); err != nil {
		return nil, fmt.Errorf("opening study page for %q: %w", lookup, err)
	}

	// text returns the trimmed text content of everything matching selector.
	text := func(selector string) string {
		return strings.TrimSpace(br.Find(selector).Text())
	}

	var kanji KanjiEntry
	kanji.Character = text("div.kanji span.cj-k")
	kanji.Reading = text("div.strokecount span.cj-k")

	// Atoi errors are deliberately ignored below: an unparseable value simply
	// leaves the counter at 0.
	kanji.FrameNumber, _ = strconv.Atoi(text("div.framenum"))

	// First try the leading space-separated token of the stroke-count text,
	// then prefer a bracketed number if the text contains one.
	strokeText := text("div.strokecount")
	kanji.StrokeCount, _ = strconv.Atoi(strings.Split(strokeText, " ")[0])
	if matches := strokeCountPattern.FindStringSubmatch(strokeText); matches != nil {
		kanji.StrokeCount, _ = strconv.Atoi(matches[1])
	}

	// The site shows a placeholder when no story has been entered; treat it
	// as an empty story.
	if kanji.Story = text("div#sv-textarea"); kanji.Story == "[ click here to enter your story ]" {
		kanji.Story = ""
	}

	br.Find("div.sharedstory").Each(func(i int, s *goquery.Selection) {
		var story StoryEntry
		story.Author = strings.TrimSpace(s.Find("div.sharedstory_author a").Text())
		story.Content = strings.TrimSpace(s.Find("div.story").Text())
		story.ModifiedDate = strings.TrimSpace(s.Find("div.lastmodified").Text())
		story.StarredCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsStar").Text()))
		story.ReportedCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsReport").Text()))
		kanji.Stories = append(kanji.Stories, story)
	})

	// Stable sort so that equally starred stories keep their page order.
	sort.Stable(kanji.Stories)

	return &kanji, nil
}
// load reads the file at path and returns its lines as lookup terms.
//
// Each line is trimmed of surrounding whitespace, and lines that are empty
// after trimming are skipped so they do not turn into empty lookups. The
// result is nil when the file contains no usable lines.
//
// An error is returned when the file cannot be opened or when reading fails
// part-way (including a line too long for bufio.Scanner), so a partially read
// file is never mistaken for a complete one.
func load(path string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		lines = append(lines, line)
	}

	// Scan returns false both at EOF and on failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading %s: %w", path, err)
	}

	return lines, nil
}
// save serializes kanjiList as JSON indented with one space per level and
// writes it to path with permissions 0644. It returns the marshalling or
// write error, if any.
func save(path string, kanjiList []*KanjiEntry) error {
	encoded, marshalErr := json.MarshalIndent(kanjiList, "", " ")
	if marshalErr != nil {
		return marshalErr
	}

	// WriteFile already yields nil on success, so its result is returned as is.
	return ioutil.WriteFile(path, encoded, 0644)
}
// main logs in to kanji.koohii.com, scrapes a set of kanji study pages and
// writes the collected entries to a JSON file.
//
// Positional arguments:
//
//	args[0]  output JSON path (required)
//	args[1]  optional file with one lookup per line; when given, it replaces
//	         the -firstFrame/-lastFrame range
//
// Without a lookup file, every frame number from -firstFrame to -lastFrame
// (both inclusive) is scraped. The program prints usage and exits with status
// 2 when credentials or the output path are missing or the frame range is
// inverted, and aborts on the first login, load, scrape or save error.
func main() {
	var (
		username   = flag.String("username", "", "login username for kanji.koohii.com")
		password   = flag.String("password", "", "login password for kanji.koohii.com")
		firstFrame = flag.Int("firstFrame", 1, "kanji first frame")
		lastFrame  = flag.Int("lastFrame", 3030, "kanji last frame")
	)

	flag.Parse()

	args := flag.Args()
	// The frame loop below is inclusive on both ends, so firstFrame ==
	// lastFrame is a valid single-frame range; only an inverted range is
	// rejected.
	if len(*username) == 0 || len(*password) == 0 || len(args) == 0 || *firstFrame > *lastFrame {
		flag.Usage()
		os.Exit(2)
	}

	br := surf.NewBrowser()
	br.SetUserAgent(agent.Firefox())
	br.AddRequestHeader("Accept", "text/html")
	br.AddRequestHeader("Accept-Charset", "utf8")

	log.Println("logging in...")
	if err := login(br, *username, *password); err != nil {
		log.Fatal(err)
	}

	var lookups []string
	if len(args) >= 2 {
		log.Printf("loading from %s...", args[1])
		var err error
		if lookups, err = load(args[1]); err != nil {
			log.Fatal(err)
		}
	} else {
		for i := *firstFrame; i <= *lastFrame; i++ {
			lookups = append(lookups, strconv.Itoa(i))
		}
	}

	var kanjiList []*KanjiEntry
	for _, lookup := range lookups {
		log.Printf("scraping %s...", lookup)
		kanji, err := scrape(br, lookup)
		if err != nil {
			log.Fatal(err)
		}

		kanjiList = append(kanjiList, kanji)
		// Pause between requests to avoid hammering the site.
		time.Sleep(2 * time.Second)
	}

	log.Printf("saving to %s...", args[0])
	if err := save(args[0], kanjiList); err != nil {
		log.Fatal(err)
	}
}