发现了一个网站 https://wall.alphacoders.com/ ,图片质量挺高,正好最近在学习go,就用go下载图片到本地
我记录一下,以便以后查看,代码肯定是有所不足,大家别笑话,看看就好。
[起始页截图 https://wall.alphacoders.com/finding_wallpapers.php ]
1、https://wall.alphacoders.com/finding_wallpapers.php 作为入口地址
2、根据入口地址获取每个分类的title和href
3、处理每个分类:进入分类页面,获取到最后一页的地址
4、知道最后一页的地址,从第一页循环到最后一页,获取每页的数据
5、循环匹配每页中原图的地址(还是个页面,并非图片)
6、在原图页面中获取待下载的图片地址:$('img.main-content').attr('src')
7、保存图片到本地,如果已保存,则不重复保存(注意:如果程序中途结束,有些图片是不完整的,不完整的图片是不会再次下载的)
【注意】:如果网页的标签有所变动,代码肯定是会报错的
【延伸】:
如果 goquery 模块不存在,cmd中执行 [ go get github.com/opesun/goquery ] 即可
这个html解析器好像不是很强,可以用正则匹配地址或者用其他的html解析器
这个可以优化一下:用 goroutine 并发下载(Go 里一般用 goroutine,而不是手动管理线程)
package main
import (
"fmt"
"io"
"log"
"net/http"
"os"
"strconv"
"strings"
//"github.com/PuerkitoBio/goquery" //这个模块配置环境经常出错就没有选择,我这里显示该模块某些方法已弃用
"github.com/opesun/goquery"
)
// DOMAIN is the base URL of the wallpaper site, including the trailing slash.
// getUrls and handleOnePage prepend it to hrefs scraped from the site's pages.
const DOMAIN string = "https://wall.alphacoders.com/"
// main is the program entry point. As written it downloads every category
// found on the entry page by calling start.
func main() {
	// Default mode: crawl and download all categories.
	start()

	// Alternative mode: download only hand-picked categories. The title/URL
	// pairs can be copied from log.txt, which getUrls writes while it runs.
	//
	//	custom := [10][2]string{
	//		{"41276Women", "https://wall.alphacoders.com/by_category.php?id=33&name=Women+Wallpapers"},
	//		{"1545Game", "https://wall.alphacoders.com/by_category.php?id=14&name=Game+Wallpapers"}}
	//	customStart(custom)
}
// customStart downloads only the categories listed in custom.
//
// Each row is a {title, url} pair: the title is passed to createDir to name
// the category's directory, and the url is the category's first page, passed
// to handleOneUrl. Rows left at their zero value (the unused slots of the
// fixed-size array) are skipped.
func customStart(custom [10][2]string) {
	fmt.Println("开始下载...")
	for _, value := range custom {
		title, href := value[0], value[1]
		// len(value) is always 2 for a [2]string, so an unused row must be
		// detected by its empty URL, not by its length.
		if href == "" {
			continue
		}
		dir := createDir(title) // directory named after the category
		handleOneUrl(href, dir) // download every page of this category
	}
	fmt.Println("结束下载!!!")
}
// start downloads every category discovered by getUrls.
//
// getUrls returns a fixed-size array of {title, url} rows; each populated row
// gets its own directory (named after the title) and is then crawled page by
// page through handleOneUrl. Rows that were never filled in are skipped.
func start() {
	fmt.Println("开始下载...")
	titleToUrls := getUrls()
	for _, value := range titleToUrls {
		title, href := value[0], value[1]
		// len(value) is always 2 for a [2]string, so an unused row must be
		// detected by its empty URL, not by its length.
		if href == "" {
			continue
		}
		dir := createDir(title)
		handleOneUrl(href, dir)
	}
	fmt.Println("结束下载!!!")
}
// handleOneUrl processes a single category: it loads the category page at
// url, works out the URL of the last result page, and hands the page range to
// handleOnePage so that every page is downloaded into dir.
//
// The last page is taken to be the anchor that immediately follows the first
// anchor (inside an <li>) whose text contains "...". When no such anchor is
// found, or its href carries no "=", the category is skipped with a message.
func handleOneUrl(url string, dir string) {
	doc, err := goquery.ParseUrl(url)
	if err != nil {
		fmt.Println("error to get url html:", err.Error())
		return
	}

	// Select the anchors once and bound the loop by their own count. Running
	// the selector on every iteration is wasted work, and bounding the index
	// by the number of <li> elements can stop before the pagination anchors
	// are reached when the two counts differ.
	anchors := doc.Find("li").Find("a")
	lastUrl := ""
	afterEllipsis := false // true once the "..." anchor has been seen
	for i, n := 0, anchors.Length(); i < n; i++ {
		link := anchors.Eq(i)
		if afterEllipsis {
			lastUrl = link.Attr("href")
			break
		}
		if strings.Contains(link.Text(), "...") {
			afterEllipsis = true
		}
	}
	if lastUrl == "" {
		fmt.Println("Not Found Last Page:", url)
		return
	}

	// Same rule getUrls applies to scraped hrefs: anything that is not already
	// an absolute http(s) URL is resolved against DOMAIN.
	if !strings.HasPrefix(lastUrl, "http") {
		lastUrl = DOMAIN + strings.TrimPrefix(lastUrl, "/")
	}

	// Split e.g. ".../by_resolution.php?w=2560&h=1440&page=474" into the
	// prefix ending in "page=" and the page number "474".
	lastEqIndex := strings.LastIndex(lastUrl, "=")
	if lastEqIndex == -1 {
		fmt.Println("Not Found Last Page:", url)
		return
	}
	startUrl := lastUrl[:lastEqIndex+1]
	lastPage := lastUrl[lastEqIndex+1:]

	handleOnePage(startUrl, lastPage, dir)
}
// handleOnePage walks pages 1..lastPage of one category and downloads every
// image found on them into dir.
//
// startUrl is the page URL up to and including the final "=", so appending a
// page number yields a full page URL. lastPage is the last page number as a
// decimal string. For every anchor inside a div.boxgrid the linked detail
// page is fetched, the src of its img.main-content element is read, and that
// image is stored via saveFile under its original file name.
func handleOnePage(startUrl string, lastPage string, dir string) {
	endPage, err := strconv.Atoi(lastPage)
	if err != nil {
		// A non-numeric page count means the pagination link was not what we
		// expected; report it instead of silently doing nothing.
		fmt.Println("PageUrl error:", err.Error())
		return
	}

	// Detail links already handled in this category, so that an image linked
	// more than once is only fetched once.
	seen := make(map[string]bool)

	for page := 1; page <= endPage; page++ {
		pageUrl := startUrl + strconv.Itoa(page)
		pageDoc, err := goquery.ParseUrl(pageUrl)
		if err != nil {
			// Skip a page that cannot be loaded and carry on with the next.
			fmt.Println("PageUrl error:", err.Error())
			continue
		}

		// Run the selector once per page and iterate over the anchors
		// themselves, rather than indexing anchors by the number of boxes.
		anchors := pageDoc.Find("div.boxgrid").Find("a")
		for i, n := 0, anchors.Length(); i < n; i++ {
			suffixUrl := anchors.Eq(i).Attr("href") // e.g. big.php?i=1079534
			if suffixUrl == "" || seen[suffixUrl] {
				continue
			}
			seen[suffixUrl] = true

			detailUrl := suffixUrl
			if !strings.HasPrefix(detailUrl, "http") {
				detailUrl = DOMAIN + strings.TrimPrefix(suffixUrl, "/")
			}

			pageDetailDoc, err := goquery.ParseUrl(detailUrl)
			if err != nil {
				fmt.Println("PageDetailUrl error:", err.Error())
				continue
			}

			// e.g. https://images8.alphacoders.com/107/thumb-1920-1079562.jpg
			finalUrl := pageDetailDoc.Find("img.main-content").Attr("src")
			if finalUrl == "" {
				// Without this guard saveFile would stat the directory
				// itself and report the (empty) image as already downloaded.
				fmt.Println("PageDetailUrl error: no img.main-content found in", detailUrl)
				continue
			}

			// File name is everything after the last "/" of the image URL.
			fileName := finalUrl[strings.LastIndex(finalUrl, "/")+1:]
			if fileName == "" {
				fmt.Println("PageDetailUrl error: no file name in", finalUrl)
				continue
			}
			saveFile(finalUrl, dir+"/"+fileName)
		}
	}
}
// saveFile downloads finalUrl and stores it at savePath.
//
// If savePath already exists the image is considered downloaded and nothing
// is fetched. Otherwise the response body is streamed into a temporary
// "<savePath>.part" file which is renamed to savePath only after the whole
// body has been written, so a failed or interrupted download never leaves a
// truncated file at savePath that later runs would mistake for a finished
// one. Failures are reported on stdout; the function returns nothing.
func saveFile(finalUrl string, savePath string) {
	// Already on disk: skip.
	if _, err := os.Stat(savePath); err == nil {
		fmt.Printf("图片 [ %s ] 已下载\n", finalUrl)
		return
	}

	// Fetch first, so that a request that fails creates no file at all.
	resp, err := http.Get(finalUrl)
	if err != nil {
		fmt.Println("imageGet error:", err.Error())
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Do not store an HTML error page under an image file name.
		fmt.Println("imageGet error:", finalUrl, resp.Status)
		return
	}

	tmpPath := savePath + ".part"
	file, err := os.Create(tmpPath)
	if err != nil {
		fmt.Println("createFile error:", err.Error())
		return
	}

	_, copyErr := io.Copy(file, resp.Body)
	closeErr := file.Close()
	if copyErr != nil {
		os.Remove(tmpPath)
		fmt.Println("imageGet error:", copyErr.Error())
		return
	}
	if closeErr != nil {
		os.Remove(tmpPath)
		fmt.Println("createFile error:", closeErr.Error())
		return
	}

	// Publish the completed download under its final name.
	if err := os.Rename(tmpPath, savePath); err != nil {
		os.Remove(tmpPath)
		fmt.Println("createFile error:", err.Error())
		return
	}

	fmt.Printf("图片 [ %s ] 下载 [ %s ] 成功\n", finalUrl, savePath)
}
// createDir makes sure the directory "./images/<dirName>" exists, creating it
// and any missing parents, and returns that path.
//
// The path is returned even when creation fails; the failure is reported on
// stdout so that it is not silently lost.
func createDir(dirName string) string {
	dir := "./images/" + dirName
	// MkdirAll does nothing and returns nil when the directory already
	// exists, so no separate existence check is needed.
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		fmt.Println("createDir error:", err.Error())
	}
	return dir
}
// getUrls scrapes the entry page and returns the categories found there as
// {title, url} rows.
//
// Every a.list-group-item anchor whose href is not "#" contributes one row:
// the title is the anchor text with spaces and newlines removed, and the url
// is the href, resolved against DOMAIN when it is not already absolute. Each
// row is also appended to log.txt. Rows are stored contiguously from index 0;
// unused rows at the end stay empty. If the entry page cannot be loaded the
// program exits with status 1.
func getUrls() [100][2]string {
	url := "https://wall.alphacoders.com/finding_wallpapers.php" // entry page
	doc, err := goquery.ParseUrl(url)
	if err != nil {
		// Nothing can be done without the entry page; exit with a non-zero
		// status so the failure is visible to the caller of the program.
		fmt.Println("SearchPage error:", err)
		os.Exit(1)
	}

	result := [100][2]string{}
	stripWhitespace := strings.NewReplacer(" ", "", "\n", "")

	urls := doc.Find("a.list-group-item")
	count := 0 // number of rows filled so far
	for i, n := 0, urls.Length(); i < n; i++ {
		item := urls.Eq(i)
		href := item.Attr("href")
		if href == "#" { // placeholder link, not a category
			continue
		}
		// The result array has a fixed size; stop instead of indexing past
		// its end when the page lists more categories than it can hold.
		if count == len(result) {
			fmt.Println("SearchPage error: more than", len(result), "categories found, the rest are ignored")
			break
		}

		text := stripWhitespace.Replace(item.Text())
		if !strings.HasPrefix(href, "http") {
			href = DOMAIN + strings.TrimPrefix(href, "/")
		}

		logger(text+" - "+href, "log.txt")

		result[count] = [2]string{text, href}
		count++
	}
	return result
}
// logger appends content as one timestamped line to logFile, creating the
// file if it does not exist. If the file cannot be opened the program is
// terminated via log.Fatalln.
func logger(content string, logFile string) {
	// O_WRONLY is required: O_APPEND|O_CREATE alone opens the file read-only
	// on Unix and every write then fails. The permission must be the octal
	// literal 0666; decimal 666 is a different bit pattern.
	file, err := os.OpenFile(logFile, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		log.Fatalln("fail to open log file:", err)
	}
	// Close the handle on every call; this function runs once per log line.
	defer file.Close()

	log.New(file, "", log.LstdFlags).Println(content)
}
log.txt 文件保存的内容如下
2020/05/20 12:31:14 MobileVersion - https://mobile.alphacoders.com
2020/05/20 12:31:14 NewestWallpapers - https://wall.alphacoders.com/newest_wallpapers.php
2020/05/20 12:31:14 FeaturedWallpapers - https://wall.alphacoders.com/featured.php
2020/05/20 12:31:14 CreatorWallpapers - https://wall.alphacoders.com/by_creator.php
2020/05/20 12:31:14 AuthorsAtWallpaperAbyss - https://wall.alphacoders.com/authors.php
2020/05/20 12:31:14 HDWallpapers - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1080
2020/05/20 12:31:14 UltraHD4kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160
2020/05/20 12:31:14 Retina5kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=5120&h=2880
2020/05/20 12:31:14 UltraHD8kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=7680&h=4320
2020/05/20 12:31:14 PopularWallpapers - https://wall.alphacoders.com/popular.php
2020/05/20 12:31:14 CCOWallpapers - https://wall.alphacoders.com/by_license.php?filter=4
2020/05/20 12:31:14 RandomWallpapers - https://wall.alphacoders.com/random.php
2020/05/20 12:31:14 RecentComments - https://wall.alphacoders.com/comments.php
2020/05/20 12:31:14 RecentCaptions - https://wall.alphacoders.com/captions.php
2020/05/20 12:31:14 HighestRated - https://wall.alphacoders.com/highest_rated.php
2020/05/20 12:31:14 MostViewed - https://wall.alphacoders.com/by_views.php
2020/05/20 12:31:14 MostFavorited - https://wall.alphacoders.com/by_favorites.php
2020/05/20 12:31:14 MostCommentedOn - https://wall.alphacoders.com/by_comments.php
2020/05/20 12:31:14 PopularCollections - https://alphacoders.com/collections
2020/05/20 12:31:14 ByTag - https://wall.alphacoders.com/all_tags.php
2020/05/20 12:31:14 ByColor - https://wall.alphacoders.com/by_color.php
2020/05/20 12:31:14 iPhone11 - https://mobile.alphacoders.com/by-device/540/iPhone-11-Wallpapers?ref=wa
2020/05/20 12:31:14 iPhoneX - https://mobile.alphacoders.com/by-device/450/iPhone-X-Wallpapers?ref=wa
2020/05/20 12:31:14 GalaxyNote10 - https://mobile.alphacoders.com/by-device/543/Galaxy-Note10-Wallpapers?ref=wa
2020/05/20 12:31:14 GooglePixel4 - https://mobile.alphacoders.com/by-device/551/Pixel-4-Wallpapers?ref=wa
2020/05/20 12:31:14 18300Abstract - https://wall.alphacoders.com/by_category.php?id=1&name=Abstract+Wallpapers
2020/05/20 12:31:14 53634Animal - https://wall.alphacoders.com/by_category.php?id=2&name=Animal+Wallpapers
2020/05/20 12:31:14 180969Anime - https://wall.alphacoders.com/by_category.php?id=3&name=Anime+Wallpapers
2020/05/20 12:31:14 20318Artistic - https://wall.alphacoders.com/by_category.php?id=4&name=Artistic+Wallpapers
2020/05/20 12:31:14 30116Celebrity - https://wall.alphacoders.com/by_category.php?id=7&name=Celebrity+Wallpapers
2020/05/20 12:31:14 24640Comics - https://wall.alphacoders.com/by_category.php?id=8&name=Comics+Wallpapers
2020/05/20 12:31:14 6087Dark - https://wall.alphacoders.com/by_category.php?id=9&name=Dark+Wallpapers
2020/05/20 12:31:14 61436Earth - https://wall.alphacoders.com/by_category.php?id=10&name=Earth+Wallpapers
2020/05/20 12:31:14 23407Fantasy - https://wall.alphacoders.com/by_category.php?id=11&name=Fantasy+Wallpapers
2020/05/20 12:31:14 17620Food - https://wall.alphacoders.com/by_category.php?id=12&name=Food+Wallpapers
2020/05/20 12:31:14 1545Game - https://wall.alphacoders.com/by_category.php?id=14&name=Game+Wallpapers
2020/05/20 12:31:14 7893Holiday - https://wall.alphacoders.com/by_category.php?id=15&name=Holiday+Wallpapers
2020/05/20 12:31:14 2398Humor - https://wall.alphacoders.com/by_category.php?id=13&name=Humor+Wallpapers
2020/05/20 12:31:14 33340ManMade - https://wall.alphacoders.com/by_category.php?id=16&name=Man+Made+Wallpapers
2020/05/20 12:31:14 523Men - https://wall.alphacoders.com/by_category.php?id=17&name=Men+Wallpapers
2020/05/20 12:31:14 10226Military - https://wall.alphacoders.com/by_category.php?id=18&name=Military+Wallpapers
2020/05/20 12:31:14 4026Misc - https://wall.alphacoders.com/by_category.php?id=19&name=Misc+Wallpapers
2020/05/20 12:31:14 45031Movie - https://wall.alphacoders.com/by_category.php?id=20&name=Movie+Wallpapers
2020/05/20 12:31:14 24587Music - https://wall.alphacoders.com/by_category.php?id=22&name=Music+Wallpapers
2020/05/20 12:31:14 17927Photography - https://wall.alphacoders.com/by_category.php?id=24&name=Photography+Wallpapers
2020/05/20 12:31:14 992Products - https://wall.alphacoders.com/by_category.php?id=25&name=Products+Wallpapers
2020/05/20 12:31:14 3168Religious - https://wall.alphacoders.com/by_category.php?id=26&name=Religious+Wallpapers
2020/05/20 12:31:14 17904SciFi - https://wall.alphacoders.com/by_category.php?id=27&name=Sci+Fi+Wallpapers
2020/05/20 12:31:14 14990Sports - https://wall.alphacoders.com/by_category.php?id=28&name=Sports+Wallpapers
2020/05/20 12:31:14 23041TVShow - https://wall.alphacoders.com/by_category.php?id=29&name=TV+Show+Wallpapers
2020/05/20 12:31:14 4204Technology - https://wall.alphacoders.com/by_category.php?id=30&name=Technology+Wallpapers
2020/05/20 12:31:14 67145Vehicles - https://wall.alphacoders.com/by_category.php?id=31&name=Vehicles+Wallpapers
2020/05/20 12:31:14 90168VideoGame - https://wall.alphacoders.com/by_category.php?id=32&name=Video+Game+Wallpapers
2020/05/20 12:31:14 1913Weapons - https://wall.alphacoders.com/by_category.php?id=34&name=Weapons+Wallpapers
2020/05/20 12:31:14 41289Women - https://wall.alphacoders.com/by_category.php?id=33&name=Women+Wallpapers
2020/05/20 12:31:14 68451280x960 - https://wall.alphacoders.com/by_resolution.php?w=1280&h=960
2020/05/20 12:31:14 130221280x1024 - https://wall.alphacoders.com/by_resolution.php?w=1280&h=1024
2020/05/20 12:31:14 41271281x961 - https://wall.alphacoders.com/by_resolution.php?w=1281&h=961
2020/05/20 12:31:14 42381366x768 - https://wall.alphacoders.com/by_resolution.php?w=1366&h=768
2020/05/20 12:31:14 60081440x900 - https://wall.alphacoders.com/by_resolution.php?w=1440&h=900
2020/05/20 12:31:14 55191600x900 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=900
2020/05/20 12:31:14 20321600x1000 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=1000
2020/05/20 12:31:14 268361600x1200 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=1200
2020/05/20 12:31:14 150881680x1050 - https://wall.alphacoders.com/by_resolution.php?w=1680&h=1050
2020/05/20 12:31:14 1819081920x1080 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1080
2020/05/20 12:31:14 1109811920x1200 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1200
2020/05/20 12:31:14 76841920x1280 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1280
2020/05/20 12:31:14 77931920x1440 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1440
2020/05/20 12:31:14 28732000x1333 - https://wall.alphacoders.com/by_resolution.php?w=2000&h=1333
2020/05/20 12:31:14 37412048x1152 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1152
2020/05/20 12:31:14 131212048x1365 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1365
2020/05/20 12:31:14 21762048x1366 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1366
2020/05/20 12:31:14 29052048x1367 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1367
2020/05/20 12:31:14 52452048x1536 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1536
2020/05/20 12:31:14 142022560x1440 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1440
2020/05/20 12:31:14 399912560x1600 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1600
2020/05/20 12:31:14 20882560x1920 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1920
2020/05/20 12:31:14 58902880x1800 - https://wall.alphacoders.com/by_resolution.php?w=2880&h=1800
2020/05/20 12:31:14 52063000x2000 - https://wall.alphacoders.com/by_resolution.php?w=3000&h=2000
2020/05/20 12:31:14 199733840x2160 - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160
2020/05/20 12:31:14 50833840x2400 - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2400
2020/05/20 12:31:14 18775184x3456 - https://wall.alphacoders.com/by_resolution.php?w=5184&h=3456
2020/05/20 12:31:14 19555616x3744 - https://wall.alphacoders.com/by_resolution.php?w=5616&h=3744
2020/05/20 12:31:14 21265760x3840 - https://wall.alphacoders.com/by_resolution.php?w=5760&h=3840
2020/05/20 12:31:14 26816000x4000 - https://wall.alphacoders.com/by_resolution.php?w=6000&h=4000