219 lines
5.3 KiB
Go
219 lines
5.3 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/json"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/headzoo/surf"
|
|
"github.com/headzoo/surf/agent"
|
|
"github.com/headzoo/surf/browser"
|
|
)
|
|
|
|
// StoryEntry is one shared story scraped from a kanji study page, in the
// shape it is written to the JSON output.
type StoryEntry struct {
	// Author is the text of the first link in the story's author block.
	Author string `json:"author"`
	// Content is the whitespace-trimmed text of the story body.
	Content string `json:"content"`
	// ModifiedDate is the "last modified" text exactly as shown on the page;
	// it is stored as a string and never parsed into a time value.
	ModifiedDate string `json:"modifiedDate"`
	// StarredCount is the number read from the story's star link, or 0 when
	// that text is not a number.
	StarredCount int `json:"starredCount"`
	// ReportedCount is the number read from the story's report link, or 0
	// when that text is not a number.
	ReportedCount int `json:"reportedCount"`
}
|
|
|
|
type KanjiEntry struct {
|
|
Character string `json:"character"`
|
|
Reading string `json:"reading"`
|
|
FrameNumber int `json:"frameNumber"`
|
|
StrokeCount int `json:"strokeCount"`
|
|
Story string `json:"story"`
|
|
Stories []StoryEntry `json:"stories"`
|
|
}
|
|
|
|
func login(br *browser.Browser, username, password string) error {
|
|
loginUrl := "http://kanji.koohii.com/login"
|
|
if err := br.Open(loginUrl); err != nil {
|
|
return err
|
|
}
|
|
|
|
fm, err := br.Form("form")
|
|
if err != nil {
|
|
return errors.New("login form not found")
|
|
}
|
|
|
|
if err := fm.Input("username", username); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := fm.Input("password", password); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := fm.Submit(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if br.Title() == "Sign In - Kanji Koohii" {
|
|
return errors.New("failed to sign in")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func scrape(br *browser.Browser, lookup string) (*KanjiEntry, error) {
|
|
if err := br.Open(fmt.Sprintf("http://kanji.koohii.com/study/kanji/%s", lookup)); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var kanji KanjiEntry
|
|
kanji.Character = strings.TrimSpace(br.Find("div.kanji span.cj-k").Text())
|
|
kanji.Reading = strings.TrimSpace(br.Find("div.strokecount span.cj-k").Text())
|
|
kanji.FrameNumber, _ = strconv.Atoi(strings.TrimSpace(br.Find("div.framenum").Text()))
|
|
kanji.StrokeCount, _ = strconv.Atoi(strings.Split(strings.TrimSpace(br.Find("div.strokecount").Text()), " ")[0])
|
|
|
|
if kanji.Story = strings.TrimSpace(br.Find("div#sv-textarea").Text()); kanji.Story == "[ click here to enter your story ]" {
|
|
kanji.Story = ""
|
|
}
|
|
|
|
if matches := regexp.MustCompile(`\[(\d+)\]`).FindStringSubmatch(br.Find("div.strokecount").Text()); matches != nil {
|
|
kanji.StrokeCount, _ = strconv.Atoi(matches[1])
|
|
}
|
|
|
|
br.Find("div#sharedstories-fav div.sharedstory").Each(func(i int, s *goquery.Selection) {
|
|
var story StoryEntry
|
|
story.Author = strings.TrimSpace(s.Find("div.sharedstory_author a:first-of-type").Text())
|
|
story.Content = strings.TrimSpace(s.Find("div.story").Text())
|
|
story.ModifiedDate = strings.TrimSpace(s.Find("div.lastmodified").Text())
|
|
story.StarredCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsStar").Text()))
|
|
story.ReportedCount, _ = strconv.Atoi(strings.TrimSpace(s.Find("a.JsReport").Text()))
|
|
kanji.Stories = append(kanji.Stories, story)
|
|
})
|
|
|
|
sort.Slice(kanji.Stories, func(i, j int) bool {
|
|
return kanji.Stories[i].StarredCount > kanji.Stories[j].StarredCount
|
|
})
|
|
|
|
return &kanji, nil
|
|
}
|
|
|
|
// load reads the lookup terms stored in the file at path, one per line.
//
// Leading and trailing whitespace is trimmed from every line and blank lines
// are skipped, because an empty term cannot be looked up. The terms are
// returned in file order.
//
// It returns an error when the file cannot be opened or when reading it
// fails part-way (for example bufio.ErrTooLong for an over-long line); in
// that case no partial list is returned.
func load(path string) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if line := strings.TrimSpace(scanner.Text()); line != "" {
			lines = append(lines, line)
		}
	}

	// Scan also returns false on a read error, so the loop ending does not by
	// itself mean the whole file was consumed.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading %s: %w", path, err)
	}

	return lines, nil
}
|
|
|
|
func save(path string, kanjiList []*KanjiEntry) error {
|
|
data, err := json.MarshalIndent(kanjiList, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.WriteFile(path, data, 0644); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func main() {
|
|
var (
|
|
firstFrame = flag.Int("firstFrame", 1, "kanji first frame")
|
|
lastFrame = flag.Int("lastFrame", 3030, "kanji last frame")
|
|
retryCount = flag.Int("retryCount", 3, "scrape retry count")
|
|
scrapeDelay = flag.Int("scrapeDelay", 2, "scrape delay in seconds")
|
|
)
|
|
|
|
flag.Usage = func() {
|
|
fmt.Fprintf(os.Stderr, "Usage: %s [options] <stories.json> [kanji.txt]\n", filepath.Base(os.Args[0]))
|
|
fmt.Fprintln(os.Stderr, "Options:")
|
|
flag.PrintDefaults()
|
|
}
|
|
|
|
flag.Parse()
|
|
|
|
args := flag.Args()
|
|
if len(args) == 0 || *firstFrame > *lastFrame {
|
|
flag.Usage()
|
|
os.Exit(2)
|
|
}
|
|
|
|
username, ok := os.LookupEnv("RTK_USER")
|
|
if !ok {
|
|
log.Fatalf("environment variable RTK_USER environment is not set")
|
|
}
|
|
|
|
password, ok := os.LookupEnv("RTK_PASS")
|
|
if !ok {
|
|
log.Fatalf("environment variable RTK_PASS environment is not set")
|
|
}
|
|
|
|
br := surf.NewBrowser()
|
|
br.SetUserAgent(agent.Firefox())
|
|
br.AddRequestHeader("Accept", "text/html")
|
|
br.AddRequestHeader("Accept-Charset", "utf8")
|
|
|
|
log.Println("logging in...")
|
|
if err := login(br, username, password); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
|
|
var lookups []string
|
|
if len(args) >= 2 {
|
|
log.Printf("loading from %s...", args[1])
|
|
var err error
|
|
if lookups, err = load(args[1]); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
} else {
|
|
for i := *firstFrame; i <= *lastFrame; i++ {
|
|
lookups = append(lookups, strconv.Itoa(i))
|
|
}
|
|
}
|
|
|
|
var kanjiList []*KanjiEntry
|
|
for _, lookup := range lookups {
|
|
var kanji *KanjiEntry
|
|
var err error
|
|
|
|
for i := 0; i < *retryCount; i++ {
|
|
log.Printf("scraping %s...", lookup)
|
|
|
|
kanji, err = scrape(br, lookup)
|
|
time.Sleep(time.Second * time.Duration(*scrapeDelay))
|
|
|
|
if err == nil {
|
|
break
|
|
}
|
|
}
|
|
|
|
if err == nil {
|
|
kanjiList = append(kanjiList, kanji)
|
|
} else {
|
|
log.Fatal(err)
|
|
}
|
|
}
|
|
|
|
log.Printf("saving to %s...", args[0])
|
|
if err := save(args[0], kanjiList); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
}
|