From 85d5d1bc363601fd826f97e3812ad324853fc61a Mon Sep 17 00:00:00 2001 From: Eduard Urbach Date: Thu, 8 Mar 2018 21:00:51 +0100 Subject: [PATCH] Improved crawler --- jobs/crawler/crawler.go | 46 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/jobs/crawler/crawler.go b/jobs/crawler/crawler.go index 1696283e..12d6a90c 100644 --- a/jobs/crawler/crawler.go +++ b/jobs/crawler/crawler.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "os" "time" "github.com/animenotifier/arn" @@ -10,13 +11,17 @@ import ( "github.com/aerogo/crawler" ) -const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.166 Safari/537.36" +const ( + // The maximum age of files we accept until we force a refresh. + maxAge = 30 * 24 * time.Hour + userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.166 Safari/537.36" +) func main() { defer arn.Node.Close() - malCrawler := crawler.New(userAgent, 1*time.Second, 20000) - count := 0 + // Filter anime with MAL ID + animes := []*arn.Anime{} for anime := range arn.StreamAnime() { malID := anime.GetMapping("myanimelist/anime") @@ -25,9 +30,40 @@ func main() { continue } + animes = append(animes, anime) + } + + color.Yellow("Found %d anime", len(animes)) + + // Create crawler + malCrawler := crawler.New( + map[string]string{ + "User-Agent": userAgent, + }, + 1500*time.Millisecond, + len(animes), + ) + + // Sort so that we download the most important ones first + arn.SortAnimeByQuality(animes, "") + + // Queue up URLs + count := 0 + + for _, anime := range animes { + malID := anime.GetMapping("myanimelist/anime") + url := "https://myanimelist.net/anime/" + malID + filePath := fmt.Sprintf("mal/anime-%s.html", malID) + fileInfo, err := os.Stat(filePath) + + if err == nil && time.Since(fileInfo.ModTime()) <= maxAge { + // fmt.Println(color.YellowString(url), "skip") + continue + } + malCrawler.Queue(&crawler.Task{ - URL: "https://myanimelist.net/anime/" + malID, - Destination: fmt.Sprintf("mal/anime-%s.html", malID), + URL: url, + Destination: filePath, }) count++