Scraping cursive-script calligraphy images from a calligraphy website

The site is http://www.shufazidian.com/. The script below submits each search word to the site's form via POST (sort=7 selects cursive script, 草书), parses the result page for image links and work titles, and saves the images into one directory per search word under E://shufa.

import requests
from bs4 import BeautifulSoup
import os


def get_page(url, word):
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.70.3704.400 QQBrowser/10.4.3587.400",
            "referer": "http://www.shufazidian.com/",
            "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN",
            "Cache-Control": "no-cache",
            "Connection": "Keep-Alive",
            "Content-Type": "application/x-www-form-urlencoded",
            # the cookie was captured from a browser session and may expire;
            # Content-Length is omitted because requests computes it from the body
            "Cookie": "cookiesort=7; Hm_lvt_5ac259f575081df787744e91bb73f04e=1563974376,1564218809; Hm_lpvt_5ac259f575081df787744e91bb73f04e=1564226330",
            "Host": "www.shufazidian.com"
        }
        data = {
            'wd': word,     # the search keyword
            'sort': 7       # 7 selects cursive script (草书)
        }
        r = requests.post(url, headers=headers, data=data)  # the search form expects a POST
        r.raise_for_status()
        return r.content    # raw bytes; BeautifulSoup detects the encoding itself
    except requests.RequestException:
        return b""          # callers treat empty bytes as a failed fetch

def parse_page(html):
    soup = BeautifulSoup(html, "lxml")      # parse the page
    pics = soup.find_all(class_="mbpho")    # the tags that wrap each image
    pic_link = list()
    name = list()
    for i in range(1, len(pics)):           # start at 1 to skip the first match, which is not an actual result
        pic = pics[i].find(name="a").find(name="img")["src"]    # image URL
        pic_link.append(pic)
        title = pics[i].find(name="a")["title"]                 # title/author of the work
        name.append(title)
    pic_dic = dict(zip(pic_link, name))     # map each image URL to its title
    return pic_dic
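# For comparison, the same extraction with CSS selectors, assuming the same
# markup (each result is a .mbpho element wrapping an <a title="..."> that
# contains the <img>). A sketch only; main() below uses parse_page.
def parse_page_css(html):
    soup = BeautifulSoup(html, "lxml")
    pic_dic = {}
    for a in soup.select(".mbpho a")[1:]:   # [1:] skips the first, non-result match
        img = a.find("img")
        if img and img.get("src") and a.get("title"):
            pic_dic[img["src"]] = a["title"]
    return pic_dic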

def to_file(url, word):
    if not os.path.exists("E://shufa"):     # create the top-level download directory
        os.mkdir("E://shufa")
    path = "E://shufa//" + word             # one sub-directory per search word
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)                          # switch the working directory to path

    html = get_page(url, word)              # fetch the result page HTML
    pic_dic = parse_page(html)              # parse it into an {image URL: title} dict
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.70.3704.400 QQBrowser/10.4.3587.400",
        "Cookie": "cookiesort=7; Hm_lvt_5ac259f575081df787744e91bb73f04e=1563974376,1564218809; Hm_lpvt_5ac259f575081df787744e91bb73f04e=1564226330"
    }
    for item in pic_dic:
        try:
            response = requests.get(item, headers=header)
            if response.status_code == 200:
                with open(pic_dic.get(item) + ".jpg", 'wb') as f:   # with closes the file even on error
                    f.write(response.content)
                print("{} saved successfully".format(pic_dic.get(item)))
        except requests.RequestException:
            continue                        # skip a failed image instead of aborting the rest
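# The work titles are used directly as file names, and on Windows the
# characters \ / : * ? " < > | are not allowed in them. A hypothetical helper
# (not wired into to_file above) that could sanitize a title before open():
import re

def safe_filename(title):
    return re.sub(r'[\\/:*?"<>|]', "_", title)  # replace forbidden characters with "_"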

def main():
    url = "http://www.shufazidian.com/"
    words = ["刘","陶","林","张","任","爱","你","我","草","书"]   # one search (and directory) per word
    for word in words:
        to_file(url, word)
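# Each keyword triggers one search plus many image downloads; to be polite
# to the server, one could pause between keywords. A sketch (main() above
# runs without any delay; the 1-second value is an assumption):
#
#   import time
#   for word in words:
#       to_file(url, word)
#       time.sleep(1)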




"""def main():
    url = "http://www.shufazidian.com/"
    words = ["刘","陶","林"]
    link = list()
    name = list()
    i=0
    if not os.path.exists("E://shufa"):
        os.mkdir("E://shufa")
    for word in words:

        html = get_page(url,word)
        pic_dic = parse_page(html)
        path = "E://shufa//" + word

        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)
        print(word)

        for item in pic_dic:
            #url = item
            print(str(url))
            header = {
                'User - Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
            }
            response = requests.get(item,headers=header)
            if response.status_code == 200:
                with open(pic_dic.get(item)+".jpg",'wb') as f:
                    f.write(response.content)
                    print("保存成功")
                    
            
              

            link.append(item)
            i = i+1
            name.append(pic_dic.get(item))
    print(name)"""

        #print(pic_dic)

if __name__ == '__main__':
    main()
