嵩天老师在B站上的课程部分源代码2

淘宝商品定向爬虫

# _*_ coding:utf-8 _*_
# 开发时间:2020/2/18 21:24
# 文件名称:reptile_practice.py
# 开发工具:PyCharm
import json
import re
from urllib.parse import quote

import requests
# The tricky part is this header: refresh the page with the browser's Network
# panel open, find the "doc" request, then copy and convert its headers.
# Still a fairly classic hands-on exercise.
#
# Request headers sent with every page download (see get_html_text).
# NOTE(review): several values contain `^` sequences (`^%^`, `^&`, `^\\^`) that
#   look like cmd.exe escaping left over from a "Copy as cURL (cmd)" export --
#   confirm and strip them if so.
# NOTE(review): the referer host is spelled 's.tabao.com' while main() requests
#   's.taobao.com' -- presumably a typo; confirm.
# NOTE(review): 'cookie' appears to hold a logged-in browser session; presumably
#   it has to be replaced with your own and kept out of version control.
headers = {
    'authority': 'www.taobao.com',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
    'sec-fetch-dest': 'iframe',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'navigate',
    'referer': 'https://s.tabao.com/search?q=^%^E4^%^B9^%^A6^%^E5^%^885^&imgfile=^&commend=all^&ssid=s5-e^&search_type=item^&soceId=tb.index^&spm=a21bo.2017.201856-taobao-item.1^&ie=utf8^&initiave_id=tbindexz_20170306',
    'accept-language':'zh-CN,zh;q=0.9',
    'cookie': 'miid=469011642127017; cna=xxzsFfT1dQsCASemacZI2YwG; _m_h5_tk=9b0501a44dff376d83c32888ca00_1582041778686; _m_h5_tk_enc=6768a279ae8f0417934b6aeefbcfb0; thw=cn; t=150a09b2f98c357b8a8eb21d4ef959; cookie2=1b68481cb2a0dd9a12bdd28b8c241; v=0; _tb_token_=e03ee37de783; _mesite_flag_=true; lgc=^%^5Cu5728^%^5Cu90A3^%^5Cu6^%^5Cu6211^%^5Cu4E0D^%^5Cu66FE^%^5Cu9A7B^%^5Cu8DB3; dnk=^%^5Cu5728^%^5Cu90A3^%^5Cu65^%^5Cu6211^%^5Cu4E0D^%^5Cu66FE^%^5Cu9A7B^%^5Cu8DB3; tracknick=^%^5Cu5728^5Cu90A3^%^5Cu65F6^%^5Cu6211^%^5Cu4E0D^%^5Cu66FE^%^5Cu9A7B^%^5Cu8DB3tg=0; enc=ItdPAoOSM4vldMdO22debFNNQMFSWptsGgoDS2RoYFUqLQhLMBsAekd1EwT3VER^%^2FgN3gMyXblXa0dsXX%^3D^%^3D; hng=CN^%^7Czh-CN^%^7CCNY^%^7C156; mt=ci=25_1; unb=2725021; uc3=id2=UU8IPTi0Xzl%^3D^%^3&lg2=URm48syIIVrSKA^%^3D^%^3D^&nk2=twxUHx5^%^2B1GxN^2Bs3UdLx3D^%^3D^&vt3=F8dBxdz2Q6rlX^%^2BH9YQs^%^3D; csg=3dc085df; cookie17=UU8i0Xzlj7w^%^3D^%^3D; skt=70247df8b4854b4b; existShop=MTU4MjA5NTQyOA^%^3D^%^3D; uc4=id4=0^%^40U22PGM3r4vPXg2PNhmks2MH13W2W^&nk4=0^%^40tX^%^2FI1WqVaWNigZKFjJcs6Ttgds765y _cc_=V32FPkk^%^2Fhw^%^3D^%^3D; _l_g_=Ug^%^3D^%^3D; sg=^%^E8^%^B6^%^B318; _nk_=^%^5Cu5728^%^5Cu90A3^%^5Cu65F6^%^5Cu6211^%^5Cu4E0D^%^5Cu66FE^%^5Cu9A7B^%^u8DB3; cookie1=Vqhw5yAh1q0K3BY2gA0h7ZNuLvLyvDgwacguRY^%^3D; uc1=cookieUIHiLt3xCS3yM2hHS9lpEOw^%^3D^%^3D^&cookie21=U^%^2BGCWk^%^2F7p4mBoUyS4E9C^&cookie15=VT5L2FSpMGV7TQ^%^3D^%^3D^&existShop=false^&pas=0^&cookie14=UoLPOqNOIgQ^%^3D^%^3D^&cart_m=0^&tag=8^&lng=zh_CN; isg=BKqqAYiBgCm0dwxCgvRNSLSE-xBMGy51j51wKjRjVv2IZ0ohHKt-hfCV95P7jKYN; l=cBQ8bQRWjxEACBOCanurza77OSIRYYuPzaNbMi_5Bw6T1y2_OoWI9PF96VjWd9XTB4KJvV7p9-etkZRwTm4K-g3fP.',
    'if-none-match': 'W/^\\^2c42602480f459^\\^',
}


def get_html_text(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and a 30 second
    timeout. The body is decoded using the encoding that requests detects
    from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, invalid URL) or the server answers with
        an HTTP error status.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat '' as "nothing to parse".
        # Only request-related failures are swallowed, so Ctrl-C and
        # programming errors still surface.
        return ''
    resp.encoding = resp.apparent_encoding
    return resp.text

def parser_page(ilt, html):
    """Extract price/title pairs from a page and append them to ``ilt``.

    The page text is searched for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title; if one list is longer than the other, the surplus entries are
    ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry (both strings)
            per item found. It is modified in place.
        html: Page text to scan. Anything that is not a string is ignored.

    Returns:
        None.
    """
    if not isinstance(html, str):
        return

    # Capture groups pull out the values directly. This avoids splitting on
    # ':' (which truncated titles containing a colon) and avoids eval() on
    # text that comes from the network.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    for price, raw_title in zip(prices, titles):
        try:
            # Decode escapes such as \uXXXX or \" the way a JSON string would.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as captured.
            title = raw_title
        ilt.append([price, title])

def print_goods_list(ilt):
    """Print the collected goods as a tab-separated table.

    A header row is printed first, followed by one row per entry holding a
    1-based running number, the price (``entry[0]``) and the title
    (``entry[1]``).

    Args:
        ilt: Sequence of entries whose first two elements are the price and
            the title.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format('序号', '价格', '商品名称')
    print(header)
    for number, entry in enumerate(ilt, start=1):
        line = row_format.format(number, entry[0], entry[1])
        print(line)


def main(goods='书包', depth=2):
    """Crawl Taobao search results for a keyword and print them as a table.

    Each page is downloaded with ``get_html_text``, parsed with
    ``parser_page`` and the accumulated results are printed with
    ``print_goods_list``.

    Args:
        goods: Search keyword. It is percent-encoded before being placed in
            the query string, so spaces and characters such as ``&`` are safe.
        depth: Number of result pages to request.

    Returns:
        None.
    """
    start_url = 'https://s.taobao.com/search?q=' + quote(goods, safe='')
    infolist = []
    for page in range(depth):
        # The `s` query parameter advances by 44 per page (presumably the
        # number of items on one result page -- confirm).
        url = start_url + '&s=' + str(page * 44)
        try:
            html = get_html_text(url)
            parser_page(infolist, html)
        except Exception as exc:
            # One bad page should not stop the crawl, but report it instead
            # of hiding it; KeyboardInterrupt is deliberately not caught.
            print('page {} skipped: {!r}'.format(page + 1, exc))
            continue
    print_goods_list(infolist)

# Run the crawler only when this file is executed as a script, so importing
# the module does not fire network requests.
if __name__ == '__main__':
    main()

技术路线:requests–re库
PS:运行前需要先在浏览器中登录淘宝,并把登录后的 cookie 填入 headers。

你可能感兴趣的:(嵩天老师在B站上的课程部分源代码2)