Initial commit
This commit is contained in:
commit
95232a44b9
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
rtk-scrape*
|
10
go.mod
Normal file
10
go.mod
Normal file
@ -0,0 +1,10 @@
|
||||
module github.com/FooSoft/rtk-scrape
|
||||
|
||||
go 1.13
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f
|
||||
github.com/headzoo/surf v1.0.0
|
||||
github.com/pacificporter/surf v1.0.3
|
||||
gopkg.in/headzoo/surf.v1 v1.0.0
|
||||
)
|
30
go.sum
Normal file
30
go.sum
Normal file
@ -0,0 +1,30 @@
|
||||
github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f h1:cWOyRTtBcTBjB0c+GyaQaXgP3g1HVM1KbvZL/Q5QNAM=
|
||||
github.com/PuerkitoBio/goquery v1.5.1-0.20190109230704-3dcf72e6c17f/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
||||
github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg=
|
||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
|
||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-ole/go-ole v1.2.5-0.20190708054153-938323a72016/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM=
|
||||
github.com/headzoo/surf v1.0.0 h1:d2h9ftKeQYj7tKqAjQtAA0lJVkO8cTxvzdXLynmNnHM=
|
||||
github.com/headzoo/surf v1.0.0/go.mod h1:/bct0m/iMNEqpn520y01yoaWxsAEigGFPnvyR1ewR5M=
|
||||
github.com/headzoo/ut v0.0.0-20181013193318-a13b5a7a02ca/go.mod h1:8926sG02TCOX4RFRzIMFIzRw4xuc/TwO2gtN7teMJZ4=
|
||||
github.com/itchio/go-brotli v0.0.0-20190702114328-3f28d645a45c h1:Jf20xV/yR/O6eSUqLTuXhka/+54YR59sGwN7b3MkxYk=
|
||||
github.com/itchio/go-brotli v0.0.0-20190702114328-3f28d645a45c/go.mod h1:oRXh43p/JW9kWosasd+2kHfDpb1ec4m7YrZ5E39s1iI=
|
||||
github.com/pacificporter/surf v1.0.3 h1:hCAzcBQpTle3ZOiOXMpFW1v+S3iN7IlSX2N6d8RIyMo=
|
||||
github.com/pacificporter/surf v1.0.3/go.mod h1:Fk2Km1poD/rB2o9a+1USaMgUX2lTVu/xC+swj6m/IAM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/shirou/gopsutil v2.19.12+incompatible h1:WRstheAymn1WOPesh+24+bZKFkqrdCR8JOc77v4xV3Q=
|
||||
github.com/shirou/gopsutil v2.19.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582 h1:p9xBe/w/OzkeYVKm234g55gMdD1nSIooTir5kV11kfA=
|
||||
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20191010194322-b09406accb47 h1:/XfQ9z7ib8eEJX2hdgFTZJ/ntt0swNk5oYBziWeTCvY=
|
||||
golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
gopkg.in/headzoo/surf.v1 v1.0.0 h1:Ti4LagTvHxSdHYHf5DTqJRhY4+pQYZ0slBPlxo2IWGU=
|
||||
gopkg.in/headzoo/surf.v1 v1.0.0/go.mod h1:T0BH8276y+OPL0E4tisxCFjBVIAKGbwdYU7AS7/EpQQ=
|
203
main.go
Normal file
203
main.go
Normal file
@ -0,0 +1,203 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/headzoo/surf"
|
||||
"github.com/headzoo/surf/agent"
|
||||
"github.com/headzoo/surf/browser"
|
||||
)
|
||||
|
||||
// StoryEntry is one community-shared story scraped from a kanji study page.
type StoryEntry struct {
	// Author is the text of the author link shown with the story.
	Author string `json:"author"`
	// Content is the story text itself.
	Content string `json:"content"`
	// ModifiedDate is the "last modified" text exactly as shown on the page;
	// it is kept as a string and never parsed into a time value.
	ModifiedDate string `json:"modifiedDate"`
	// StarredCount is the number shown on the story's star link (0 when the
	// text is not a number).
	StarredCount int `json:"starredCount"`
	// ReportedCount is the number shown on the story's report link (0 when
	// the text is not a number).
	ReportedCount int `json:"reportedCount"`
}
|
||||
|
||||
type KanjiEntry struct {
|
||||
Character string `json:"character"`
|
||||
Reading string `json:"reading"`
|
||||
FrameNumber int `json:"frameNumber"`
|
||||
StrokeCount int `json:"strokeCount"`
|
||||
Story string `json:"story"`
|
||||
Stories StoryEntryList `json:"stories"`
|
||||
}
|
||||
|
||||
type StoryEntryList []StoryEntry
|
||||
|
||||
func (e StoryEntryList) Len() int {
|
||||
return len(e)
|
||||
}
|
||||
|
||||
func (e StoryEntryList) Less(i, j int) bool {
|
||||
return e[i].StarredCount > e[j].StarredCount
|
||||
}
|
||||
|
||||
func (e StoryEntryList) Swap(i, j int) {
|
||||
e[i], e[j] = e[j], e[i]
|
||||
}
|
||||
|
||||
func login(br *browser.Browser, username, password string) error {
|
||||
loginUrl := "http://kanji.koohii.com/login"
|
||||
if err := br.Open(loginUrl); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fm, err := br.Form("form")
|
||||
if err != nil {
|
||||
return errors.New("login form not found")
|
||||
}
|
||||
|
||||
if err := fm.Input("username", username); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := fm.Input("password", password); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := fm.Submit(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if br.Title() == "Sign In - Kanji Koohii" {
|
||||
return errors.New("failed to sign in")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func scrape(br *browser.Browser, lookup string) (*KanjiEntry, error) {
|
||||
if err := br.Open(fmt.Sprintf("http://kanji.koohii.com/study/kanji/%s", lookup)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var kanji KanjiEntry
|
||||
kanji.Character = strings.TrimSpace(br.Find("div.kanji span.cj-k").Text())
|
||||
kanji.Reading = strings.TrimSpace(br.Find("div.strokecount span.cj-k").Text())
|
||||
kanji.FrameNumber, _ = strconv.Atoi(strings.TrimSpace(br.Find("div.framenum").Text()))
|
||||
kanji.StrokeCount, _ = strconv.Atoi(strings.Split(strings.TrimSpace(br.Find("div.strokecount").Text()), " ")[0])
|
||||
|
||||
if kanji.Story = strings.TrimSpace(br.Find("div#sv-textarea").Text()); kanji.Story == "[ click here to enter your story ]" {
|
||||
kanji.Story = ""
|
||||
}
|
||||
|
||||
if matches := regexp.MustCompile(`\[(\d+)\]`).FindStringSubmatch(br.Find("div.strokecount").Text()); matches != nil {
|
||||
kanji.StrokeCount, _ = strconv.Atoi(matches[1])
|
||||
}
|
||||
|
||||
br.Find("div.sharedstory").Each(func(i int, s *goquery.Selection) {
|
||||
var story StoryEntry
|
||||
story.Author = strings.TrimSpace(s.Find("div.sharedstory_author a").Text())
|
||||
story.Content = strings.TrimSpace(s.Find("div.story").Text())
|
||||
story.ModifiedDate = strings.TrimSpace(s.Find("div.lastmodified").Text())
|
||||
story.StarredCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsStar").Text()))
|
||||
story.ReportedCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsReport").Text()))
|
||||
kanji.Stories = append(kanji.Stories, story)
|
||||
})
|
||||
|
||||
sort.Sort(kanji.Stories)
|
||||
|
||||
return &kanji, nil
|
||||
}
|
||||
|
||||
// load reads lookup terms from the file at path, one term per line.
//
// Surrounding whitespace is trimmed from every line and blank lines are
// skipped, so a trailing newline or stray spacing never produces an empty
// lookup. It returns an error if the file cannot be opened or if reading
// stops early (for example on an I/O failure or a line too long for the
// scanner), rather than silently returning a truncated list.
func load(path string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if line := strings.TrimSpace(scanner.Text()); line != "" {
			lines = append(lines, line)
		}
	}

	// Scan returns false both at end of input and on failure; only Err tells
	// the two apart.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading %s: %w", path, err)
	}

	return lines, nil
}
|
||||
|
||||
func save(path string, kanjiList []*KanjiEntry) error {
|
||||
data, err := json.MarshalIndent(kanjiList, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := ioutil.WriteFile(path, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
var (
|
||||
username = flag.String("username", "", "login username for kanji.koohii.com")
|
||||
password = flag.String("password", "", "login password for kanji.koohii.com")
|
||||
firstFrame = flag.Int("firstFrame", 1, "kanji first frame")
|
||||
lastFrame = flag.Int("lastFrame", 3030, "kanji last frame")
|
||||
)
|
||||
|
||||
flag.Parse()
|
||||
|
||||
args := flag.Args()
|
||||
if len(*username) == 0 || len(*password) == 0 || len(args) == 0 || *firstFrame >= *lastFrame {
|
||||
flag.Usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
br := surf.NewBrowser()
|
||||
br.SetUserAgent(agent.Firefox())
|
||||
br.AddRequestHeader("Accept", "text/html")
|
||||
br.AddRequestHeader("Accept-Charset", "utf8")
|
||||
|
||||
log.Println("logging in...")
|
||||
if err := login(br, *username, *password); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
var lookups []string
|
||||
if len(args) >= 2 {
|
||||
log.Printf("loading from %s...", args[1])
|
||||
var err error
|
||||
if lookups, err = load(args[1]); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
} else {
|
||||
for i := *firstFrame; i <= *lastFrame; i++ {
|
||||
lookups = append(lookups, strconv.Itoa(i))
|
||||
}
|
||||
}
|
||||
|
||||
var kanjiList []*KanjiEntry
|
||||
for _, lookup := range lookups {
|
||||
log.Printf("scraping %s...", lookup)
|
||||
kanji, err := scrape(br, lookup)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
kanjiList = append(kanjiList, kanji)
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
|
||||
log.Printf("saving to %s...", args[0])
|
||||
if err := save(args[0], kanjiList); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user