使用requests库和正则表达式爬取淘宝商品信息(入门版)

使用requests库和正则表达式爬取淘宝商品信息(入门版)

最近在B站上学习Python爬虫,根据老师的教程打了一遍,出现了一些问题,主要是因为淘宝的反爬机制,但在增加headers和cookie之后就没什么问题了。
可以参考这篇教程设置cookie:https://www.cnblogs.com/huahuayu/p/8207037.html

附上源码:

import json  # JSON string decoding (used to unescape scraped titles)
import re  # regular expressions

import bs4
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parser


# Fetch the page content with requests
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request is sent with browser-like headers; the ``referer`` and
    ``cookie`` values below are placeholders (``'*'``) that must be replaced
    with real values copied from a logged-in browser session.

    Args:
        url: The address to request.

    Returns:
        The decoded response text, or ``""`` if the request failed
        (connection error, timeout, or a 4xx/5xx status).
    """
    headers = {  # counter the anti-scraping checks: send browser headers and a cookie
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'referer': '*',  # referer and cookie omitted here
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cookie': '*',
    }

    print(url)
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx status
        # Use the encoding guessed from the body rather than the header value.
        r.encoding = r.apparent_encoding
        print(r.request.url)
        return r.text
    except requests.RequestException as exc:
        # Only network/HTTP failures are treated as "no page"; anything else
        # (e.g. KeyboardInterrupt, programming errors) is left to propagate.
        print('getHTMLText Fail', exc)
        return ""


# Extract (price, title) pairs from the search-result page
def parsePage(ilt, html):
    """Append every ``[price, title]`` pair found in *html* to *ilt*.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the two counts differ, the surplus entries are ignored.

    Args:
        ilt: List that receives the results; it is modified in place.
        html: Page text to scan.

    Returns:
        None. Prints ``parsePage Fail`` and leaves *ilt* untouched when
        *html* is not a string.
    """
    price_regex = re.compile(r'"view_price":"([\d.]*)"')
    # The title body may contain backslash escapes (including \"), so match
    # "any non-quote/non-backslash char, or a backslash followed by any char".
    title_regex = re.compile(r'"raw_title":"((?:[^"\\]|\\.)*)"')
    try:
        prices = price_regex.findall(html)
        titles = title_regex.findall(html)
    except TypeError:
        print('parsePage Fail')
        return

    for price, raw_title in zip(prices, titles):
        try:
            # Decode escapes such as \uXXXX or \" without eval(): the text
            # comes from a remote server and must not be executed as code.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as scraped.
            title = raw_title
        ilt.append([price, title])


# Print the collected goods as a table
def printGoodsList(ilt):
    """Print a header line followed by one numbered line per entry of *ilt*.

    Args:
        ilt: Sequence of items whose element 0 is the price and element 1
            is the title.
    """
    row_format = "{:4}\t{:8}\t{:^16}"
    header = row_format.format("序号", "价格", "商品名称")
    print(header)
    # Numbering starts at 1 so the first product is row 1.
    for number, entry in enumerate(ilt, start=1):
        line = row_format.format(number, entry[0], entry[1])
        print(line)


def main(goods='书', depth=2):
    """Fetch *depth* search-result pages for *goods* and print what was found.

    For page index ``i`` the ``s`` query parameter is set to ``44 * i``.
    A page that cannot be fetched or parsed is skipped; the remaining pages
    are still processed.

    Args:
        goods: Search keyword appended to the ``q`` query parameter.
        depth: Number of result pages to request.
    """
    start_url = 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.2017.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=' + goods
    page_suffix = '&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s='
    infoList = []
    for page in range(depth):
        url = start_url + page_suffix + str(44 * page)
        try:
            html = getHTMLText(url)
            if not html:
                # An empty string means the fetch failed (already reported
                # by getHTMLText), so there is nothing to parse.
                continue
            print("getHTMLText Succeed")

            found_before = len(infoList)
            parsePage(infoList, html)
            # parsePage returns nothing, so success is judged by whether it
            # actually added any items for this page.
            if len(infoList) > found_before:
                print("parsePage succeed")
        except Exception as exc:
            # Best effort: report the problem and move on to the next page.
            print("page", page, "skipped:", exc)
            continue
    printGoodsList(infoList)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

你可能感兴趣的:(python)