rtk-scrape/main.go
2024-04-01 21:51:47 -07:00

219 lines
5.3 KiB
Go

package main
import (
"bufio"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/headzoo/surf"
"github.com/headzoo/surf/agent"
"github.com/headzoo/surf/browser"
)
// StoryEntry holds one shared story as it is written to the JSON output.
type StoryEntry struct {
	// Author is the name shown in the story's author link.
	Author string `json:"author"`
	// Content is the story text with surrounding whitespace removed.
	Content string `json:"content"`
	// ModifiedDate is the "last modified" text, kept as the raw string from the page.
	ModifiedDate string `json:"modifiedDate"`
	// StarredCount is the number parsed from the story's star link (0 if it did not parse).
	StarredCount int `json:"starredCount"`
	// ReportedCount is the number parsed from the story's report link (0 if it did not parse).
	ReportedCount int `json:"reportedCount"`
}
// KanjiEntry holds everything scraped for a single kanji study page, in the
// shape it is written to the JSON output.
type KanjiEntry struct {
	// Character is the kanji itself.
	Character string `json:"character"`
	// Reading is the text shown next to the stroke count on the page.
	Reading string `json:"reading"`
	// FrameNumber is the frame number shown on the page (0 if it did not parse).
	FrameNumber int `json:"frameNumber"`
	// StrokeCount is the number of strokes (0 if it did not parse).
	StrokeCount int `json:"strokeCount"`
	// Story is the logged-in user's own story; empty when none has been entered.
	Story string `json:"story"`
	// Stories are the shared stories found on the page.
	Stories []StoryEntry `json:"stories"`
}
// login signs in to Kanji Koohii through the site's login form.
//
// It opens the login page, fills the "username" and "password" inputs of the
// form selected by "form", and submits it. An error is returned when any of
// those steps fails, or when the page title after submitting is still the
// sign-in page title.
func login(br *browser.Browser, username, password string) error {
	const (
		loginURL    = "http://kanji.koohii.com/login"
		signInTitle = "Sign In - Kanji Koohii"
	)

	if err := br.Open(loginURL); err != nil {
		return err
	}

	form, err := br.Form("form")
	if err != nil {
		// The underlying error is discarded in favour of a fixed message.
		return errors.New("login form not found")
	}

	credentials := [...]struct{ field, value string }{
		{"username", username},
		{"password", password},
	}
	for _, c := range credentials {
		if err := form.Input(c.field, c.value); err != nil {
			return err
		}
	}

	if err := form.Submit(); err != nil {
		return err
	}

	// Still being on the sign-in page after submitting means the login was rejected.
	if br.Title() != signInTitle {
		return nil
	}
	return errors.New("failed to sign in")
}
// strokeCountRe matches a bracketed number such as "[12]"; when present in the
// stroke count text it takes precedence over the leading number.
var strokeCountRe = regexp.MustCompile(`\[(\d+)\]`)

// scrape opens the study page for lookup and extracts the kanji's details and
// its shared stories.
//
// lookup is appended verbatim to the study URL. Numeric fields are parsed on a
// best-effort basis: text that is not a number leaves the field at 0. Shared
// stories are returned ordered by descending star count; stories with the same
// star count keep the order in which they appear on the page.
//
// An error is returned only when the page cannot be opened.
func scrape(br *browser.Browser, lookup string) (*KanjiEntry, error) {
	if err := br.Open(fmt.Sprintf("http://kanji.koohii.com/study/kanji/%s", lookup)); err != nil {
		return nil, fmt.Errorf("opening study page for %q: %w", lookup, err)
	}

	// The stroke count element is consulted twice below, so read its text once.
	strokeText := br.Find("div.strokecount").Text()

	var kanji KanjiEntry
	kanji.Character = strings.TrimSpace(br.Find("div.kanji span.cj-k").Text())
	kanji.Reading = strings.TrimSpace(br.Find("div.strokecount span.cj-k").Text())

	// Parse failures are deliberately ignored; the field then stays 0.
	kanji.FrameNumber, _ = strconv.Atoi(strings.TrimSpace(br.Find("div.framenum").Text()))

	// First guess: the leading space-separated token of the stroke count text.
	kanji.StrokeCount, _ = strconv.Atoi(strings.Split(strings.TrimSpace(strokeText), " ")[0])
	// A bracketed number, if present, overrides the first guess.
	if m := strokeCountRe.FindStringSubmatch(strokeText); m != nil {
		kanji.StrokeCount, _ = strconv.Atoi(m[1])
	}

	// The placeholder shown when the user has no story is normalized to "".
	kanji.Story = strings.TrimSpace(br.Find("div#sv-textarea").Text())
	if kanji.Story == "[ click here to enter your story ]" {
		kanji.Story = ""
	}

	br.Find("div#sharedstories-fav div.sharedstory").Each(func(i int, s *goquery.Selection) {
		var story StoryEntry
		story.Author = strings.TrimSpace(s.Find("div.sharedstory_author a:first-of-type").Text())
		story.Content = strings.TrimSpace(s.Find("div.story").Text())
		story.ModifiedDate = strings.TrimSpace(s.Find("div.lastmodified").Text())
		story.StarredCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsStar").Text()))
		story.ReportedCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsReport").Text()))
		kanji.Stories = append(kanji.Stories, story)
	})

	// Stable sort so that equally-starred stories keep their page order.
	sort.SliceStable(kanji.Stories, func(i, j int) bool {
		return kanji.Stories[i].StarredCount > kanji.Stories[j].StarredCount
	})

	return &kanji, nil
}
// load reads the lookup list stored at path, one lookup per line.
//
// Surrounding whitespace is trimmed from every line and blank lines are
// skipped, so stray empty lines do not turn into empty lookups. An error is
// returned when the file cannot be opened or when reading it fails part-way
// (including a line too long for the scanner's buffer); in that case no
// partial list is returned.
func load(path string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		lines = append(lines, line)
	}
	// Scan returning false can mean EOF or a failure; only Err distinguishes them.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading %s: %w", path, err)
	}
	return lines, nil
}
// save encodes kanjiList as indented JSON and writes the result to path,
// creating the file with mode 0644 or replacing its contents if it exists.
// It returns the first encoding or write error encountered.
func save(path string, kanjiList []*KanjiEntry) error {
	encoded, err := json.MarshalIndent(kanjiList, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, encoded, 0644)
}
// main logs in to Kanji Koohii, scrapes a list of kanji study pages and writes
// the results as JSON.
//
// Usage: <program> [options] <stories.json> [kanji.txt]
//
// The first positional argument is the output file. When a second positional
// argument is given, lookups are read from that file and the frame range flags
// are not used; otherwise every frame number from -firstFrame to -lastFrame
// (inclusive) is scraped. Credentials are taken from the RTK_USER and RTK_PASS
// environment variables. Any failure terminates the program; nothing is saved
// unless every lookup was scraped successfully.
func main() {
	var (
		firstFrame  = flag.Int("firstFrame", 1, "kanji first frame")
		lastFrame   = flag.Int("lastFrame", 3030, "kanji last frame")
		retryCount  = flag.Int("retryCount", 3, "scrape retry count")
		scrapeDelay = flag.Int("scrapeDelay", 2, "scrape delay in seconds")
	)
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [options] <stories.json> [kanji.txt]\n", filepath.Base(os.Args[0]))
		fmt.Fprintln(os.Stderr, "Options:")
		flag.PrintDefaults()
	}
	flag.Parse()

	args := flag.Args()
	if len(args) == 0 || *firstFrame > *lastFrame {
		flag.Usage()
		os.Exit(2)
	}

	username, ok := os.LookupEnv("RTK_USER")
	if !ok {
		log.Fatal("environment variable RTK_USER is not set")
	}
	password, ok := os.LookupEnv("RTK_PASS")
	if !ok {
		log.Fatal("environment variable RTK_PASS is not set")
	}

	br := surf.NewBrowser()
	br.SetUserAgent(agent.Firefox())
	br.AddRequestHeader("Accept", "text/html")
	br.AddRequestHeader("Accept-Charset", "utf8")

	log.Println("logging in...")
	if err := login(br, username, password); err != nil {
		log.Fatal(err)
	}

	var lookups []string
	if len(args) >= 2 {
		log.Printf("loading from %s...", args[1])
		var err error
		if lookups, err = load(args[1]); err != nil {
			log.Fatal(err)
		}
	} else {
		for i := *firstFrame; i <= *lastFrame; i++ {
			lookups = append(lookups, strconv.Itoa(i))
		}
	}

	// Always make at least one attempt: with a retry count below 1 the attempt
	// loop would never run and a nil entry would be recorded for every lookup.
	attempts := *retryCount
	if attempts < 1 {
		attempts = 1
	}
	delay := time.Duration(*scrapeDelay) * time.Second

	var kanjiList []*KanjiEntry
	for _, lookup := range lookups {
		var (
			kanji *KanjiEntry
			err   error
		)
		for attempt := 1; attempt <= attempts; attempt++ {
			log.Printf("scraping %s...", lookup)
			kanji, err = scrape(br, lookup)
			// The pause follows every request, successful or not (presumably
			// to rate-limit traffic to the site).
			time.Sleep(delay)
			if err == nil {
				break
			}
			log.Printf("scraping %s failed (attempt %d of %d): %v", lookup, attempt, attempts, err)
		}
		if err != nil {
			log.Fatal(err)
		}
		kanjiList = append(kanjiList, kanji)
	}

	log.Printf("saving to %s...", args[0])
	if err := save(args[0], kanjiList); err != nil {
		log.Fatal(err)
	}
}