Taobao Scraper Code from Bilibili - Python

A scraper for the Taobao search site, written with the requests and re libraries.

import re
import requests

def getHTMLText(url):
    # Fetch one search-results page and return its HTML text,
    # or an error message string if the request fails.
    try:
        header = {
            'authority': 's.taobao.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://www.taobao.com/',
            'accept-language': 'en,en-GB;q=0.9,zh;q=0.8,zh-CN;q=0.7,en-US;q=0.6',
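            # NOTE: this cookie is tied to the author's logged-in session and
            # will not work as-is; replace it with your own browser cookie
            # captured after logging in to Taobao (see the note below the code).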
            'cookie': 'cna=d21JF1QyZFsCAZ9BYR71xdyg; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; sgcookie=E4i9L9gKHEpxQIMjiPBeH; uc3=nk2=FPjangLZtTxJ6OM2&lg2=UIHiLt3xD8xYTw%3D%3D&vt3=F8dBxGJtZZ4KKxfRhRY%3D&id2=UojWlSfz7FajKg%3D%3D; lgc=wangyu155465; uc4=nk4=0%40FnNat4I5QG8jxg%2F1nEmRVlUqUVhZX6Y%3D&id4=0%40UOBStweVBZ3pFeiDJbNe%2BHoGKK3u; tracknick=wangyu155465; _cc_=UtASsssmfA%3D%3D; enc=I9pt99Wj0gIbiJBOqucj2SGIavHM0lSp8O0UGT6cnAlyWqe65G9jSwalAVHy8UBl21V80Ih0C2%2BMPXaRHMlWOA%3D%3D; tfstk=c-U1BQVTZdvs9EI2751EQrwW60uCaDASGGMg1k8TE1kpudNspsvBzY6kSovvJ0hC.; v=0; mt=ci=-1_0; cookie2=1f1553f3a651d86583719cc43a71e099; t=66fb8e3c9c82ae03405abd9e6e2e2fe2; _tb_token_=53e88a105388e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _nk_=wangyu155465; JSESSIONID=1C4852C4B39774C761FA4496C04E845C; uc1=cookie14=UoTV6eyS7JbKRA%3D%3D; l=eBQnLhL4Q05zmuICBOfanurza77OSIRYSuPzaNbMiOCPOM5p5wIhWZkDqJ89C3GNh6RXR3oIr-vXBeYBqIv4n5U62j-la_kmn; isg=BC8v83o_rLX5i6lJLZZvok9tvkM51IP2TTLHVEG8yx6lkE-SSaQTRi1CFoiu7Vtu',
        }
        r = requests.get(url, headers=header, timeout=10)  # time out rather than hang
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "页面爬取失败"  # "page fetch failed"
def parsePage(ilt, html):
    # Extract price/title pairs from the JSON embedded in the page source.
    try:
        plt = re.findall(r'"view_price":"\d+\.\d*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            # Keep the part after the first colon and drop the surrounding
            # quotes; split only once so titles containing ':' stay intact.
            # (The original used eval() here, which would execute arbitrary
            # text from the page and is unsafe.)
            price = plt[i].split(':', 1)[1].strip('"')
            title = tlt[i].split(':', 1)[1].strip('"')
            ilt.append([price, title])
    except Exception:
        print("解析出错")  # "parse error"
def printGoodsList(ilt):
    # Print the collected items as a numbered, tab-separated table.
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))  # No. / price / product title
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))
def main():
    goods = "书包"  # search keyword ("backpack")
    depth = 2       # number of result pages to crawl
    start_url = "https://s.taobao.com/search?q=" + goods
    infolist = []
    for i in range(depth):
        try:
            # Taobao lists 44 items per page, so 's' advances in steps of 44.
            url = start_url + '&s=' + str(44*i)
            html = getHTMLText(url)
            parsePage(infolist, html)
        except Exception:
            continue
    printGoodsList(infolist)

if __name__ == '__main__':
    main()

Note that Taobao introduced an anti-scraping mechanism in 2019, so you must log in first and then scrape: the header information in headers (in particular the cookie) has to be rewritten with your own session data. For details, see this blog post: https://blog.csdn.net/Guanhai1617/article/details/104120581.
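
One way to avoid hardcoding a session cookie into the script is to read it from an environment variable at run time. The sketch below is not part of the original code: the variable name TAOBAO_COOKIE and the function name getHTMLTextLoggedIn are assumptions, and you would paste your own browser cookie into the variable after logging in.

import os
import requests

def getHTMLTextLoggedIn(url):
    # Hypothetical variant of getHTMLText: the cookie comes from the
    # TAOBAO_COOKIE environment variable (an assumed name) instead of
    # being hardcoded in the source file.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        'cookie': os.environ.get('TAOBAO_COOKIE', ''),
    }
    try:
        r = requests.get(url, headers=header, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "页面爬取失败"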

getHTMLText(url): fetches the web page content.
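
For example, with the function defined as above, a quick smoke test looks like this (whether you get real HTML or the error string depends on whether the cookie is still valid):

html = getHTMLText("https://s.taobao.com/search?q=书包")
print(html[:200])  # first characters of the page, or "页面爬取失败"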

parsePage(ilt, html): parses the returned page and appends [price, title] pairs to ilt.
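
To see what the two regular expressions actually capture, here is a small check against a hand-written fragment shaped like the JSON embedded in Taobao's search page (the fragment itself is made up for illustration):

import re

sample = '"raw_title":"简约双肩包","view_price":"59.00"'  # made-up fragment
print(re.findall(r'"view_price":"\d+\.\d*"', sample))
# ['"view_price":"59.00"']
print(re.findall(r'"raw_title":".*?"', sample))
# ['"raw_title":"简约双肩包"']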

printGoodsList(ilt): prints the collected information as a formatted table.
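
A quick way to see the table format without hitting the site is to feed printGoodsList a hand-made list (the item below is invented; exact alignment depends on your terminal's tab stops and CJK character widths):

demo = [["59.00", "简约双肩包"]]
printGoodsList(demo)
# 序号    价格            商品名称
#    1    59.00           简约双肩包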

In main(), note that depth controls the crawl depth, i.e., how many result pages are scraped; since each Taobao results page holds 44 items, the s query parameter advances in steps of 44, as the snippet below shows.
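
start_url = "https://s.taobao.com/search?q=书包"
for i in range(2):  # depth = 2
    print(start_url + '&s=' + str(44*i))
# https://s.taobao.com/search?q=书包&s=0
# https://s.taobao.com/search?q=书包&s=44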
