基于urllib的基础爬虫

  • 爬取站长素材的基本图片数据
import json
import ssl
import urllib.parse
import urllib.request

from lxml import etree

# Disable HTTPS certificate verification process-wide so urlopen does not
# raise on sites with invalid/self-signed certs.
# NOTE(review): this weakens TLS for the whole process — acceptable only
# for a throwaway scraping script, never in production code.
ssl._create_default_https_context = ssl._create_unverified_context

#返回请求对象
def create_request():
    """Build the urllib Request object for the target listing page.

    Returns:
        urllib.request.Request: request for the chinaz picture-tag page,
        carrying a browser-like User-Agent header.
    """
    target_url = 'http://sc.chinaz.com/tag_tupian/YaZhouMeiNv.html'
    # Spoof a browser User-Agent so the site serves the normal page
    # instead of blocking the default urllib client string.
    ua_headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    }
    return urllib.request.Request(url=target_url, headers=ua_headers)

def create_content(request):
    """Fetch the page for *request*, extract image titles/URLs, save as JSON.

    Args:
        request: a prepared urllib.request.Request for the listing page.

    Side effects:
        Writes '站长素材数据.json' in the working directory containing a
        list of {'title': ..., 'img': ...} dicts.
    """
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    tree = etree.HTML(content)
    # XPath returns parallel lists of strings; the real image URL is kept
    # in the lazy-load attribute @src2, not @src.
    name_list = tree.xpath('//div[@class="box picblock col3"]//img/@alt')
    img_list = tree.xpath('//div[@class="box picblock col3"]//img/@src2')
    # zip pairs names with URLs directly (was an index loop shadowing the
    # builtin `list`); truncates to the shorter list if counts mismatch.
    items = [{'title': name, 'img': img} for name, img in zip(name_list, img_list)]
    # json.dump emits valid JSON; the original str(list) wrote a Python
    # repr with single quotes, which is not parseable as JSON.
    with open('站长素材数据.json', 'w', encoding='utf-8') as fp:
        json.dump(items, fp, ensure_ascii=False)

if __name__ == '__main__':
    # Build the request, then fetch, parse and persist the image data.
    create_content(create_request())

你可能感兴趣的:(基于urllib的基础爬虫)