From a559227b656b90cab14a9e282aa17d8d3060754f Mon Sep 17 00:00:00 2001 From: Eduard Urbach Date: Tue, 30 Oct 2018 08:59:35 +0900 Subject: [PATCH] Updated MAL tools --- jobs/mal-download/mal-download.go | 8 +++++--- jobs/mal-parse/mal-parse.go | 13 +++++++++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/jobs/mal-download/mal-download.go b/jobs/mal-download/mal-download.go index b0d5262b..c256da24 100644 --- a/jobs/mal-download/mal-download.go +++ b/jobs/mal-download/mal-download.go @@ -15,7 +15,8 @@ const ( // The maximum age of files we accept until we force a refresh. maxAge = 24 * time.Hour delayBetweenRequests = 1100 * time.Millisecond - userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" + userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.20 Safari/537.36" + animeDirectory = "anime" ) var headers = map[string]string{ @@ -50,7 +51,7 @@ func main() { arn.Node.Close() // Create anime directory if it's missing - os.Mkdir("anime", 0777) + os.Mkdir(animeDirectory, 0777) // Create crawler malCrawler := crawler.New( @@ -80,7 +81,7 @@ func main() { func queue(anime *arn.Anime, malCrawler *crawler.Crawler) { malID := anime.GetMapping("myanimelist/anime") url := "https://myanimelist.net/anime/" + malID - filePath := fmt.Sprintf("anime/anime-%s.html", malID) + filePath := fmt.Sprintf("%s/%s.html.gz", animeDirectory, malID) fileInfo, err := os.Stat(filePath) if err == nil && time.Since(fileInfo.ModTime()) <= maxAge { @@ -91,5 +92,6 @@ func queue(anime *arn.Anime, malCrawler *crawler.Crawler) { malCrawler.Queue(&crawler.Task{ URL: url, Destination: filePath, + Raw: true, }) } diff --git a/jobs/mal-parse/mal-parse.go b/jobs/mal-parse/mal-parse.go index d0a170de..3df92f9d 100644 --- a/jobs/mal-parse/mal-parse.go +++ b/jobs/mal-parse/mal-parse.go @@ -1,6 +1,7 @@ package main import ( + "compress/gzip" "errors" "fmt" "os" @@ -36,7 +37,7 @@ func main() { return nil } - if !strings.HasSuffix(name, ".html") { + if !strings.HasSuffix(name, ".html.gz") { return nil } @@ -53,7 +54,15 @@ func readFile(name string) error { } defer file.Close() - anime, characters, err := malparser.ParseAnime(file) + + reader, err := gzip.NewReader(file) + + if err != nil { + fmt.Println(err) + return err + } + + anime, characters, err := malparser.ParseAnime(reader) if err != nil { fmt.Println(err)