Improved crawler

This commit is contained in:
Eduard Urbach 2018-03-08 21:00:51 +01:00
parent 13ad0b78da
commit 85d5d1bc36

View File

@ -2,6 +2,7 @@ package main
import (
"fmt"
"os"
"time" "time"
"github.com/animenotifier/arn" "github.com/animenotifier/arn"
@ -10,13 +11,17 @@ import (
"github.com/aerogo/crawler" "github.com/aerogo/crawler"
) )
const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.166 Safari/537.36" const (
// The maximum age of files we accept until we force a refresh.
maxAge = 30 * 24 * time.Hour
userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.166 Safari/537.36"
)
func main() {
defer arn.Node.Close()
malCrawler := crawler.New(userAgent, 1*time.Second, 20000) // Filter anime with MAL ID
count := 0 animes := []*arn.Anime{}
for anime := range arn.StreamAnime() {
malID := anime.GetMapping("myanimelist/anime")
@ -25,9 +30,40 @@ func main() {
continue
}
animes = append(animes, anime)
}
color.Yellow("Found %d anime", len(animes))
// Create crawler
malCrawler := crawler.New(
map[string]string{
"User-Agent": userAgent,
},
1500*time.Millisecond,
len(animes),
)
// Sort so that we download the most important ones first
arn.SortAnimeByQuality(animes, "")
// Queue up URLs
count := 0
for _, anime := range animes {
malID := anime.GetMapping("myanimelist/anime")
url := "https://myanimelist.net/anime/" + malID
filePath := fmt.Sprintf("mal/anime-%s.html", malID)
fileInfo, err := os.Stat(filePath)
if err == nil && time.Since(fileInfo.ModTime()) <= maxAge {
// fmt.Println(color.YellowString(url), "skip")
continue
}
malCrawler.Queue(&crawler.Task{
URL: "https://myanimelist.net/anime/" + malID, URL: url,
Destination: fmt.Sprintf("mal/anime-%s.html", malID), Destination: filePath,
})
count++ count++