Scraping a Website That Requires a Logged-In Account Using Cookies

# scrape the ** site (name redacted in the original) after logging in
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    # paste the Cookie obtained from the page after logging in with your account
    'Cookie': 'Cookie of the page obtained after logging in with your account'
}
res = requests.get("URL of the page to scrape", headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
print(soup)
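
As an aside, requests can also accept cookies as a dict through its cookies= parameter instead of a raw Cookie header. A minimal sketch, assuming a hypothetical cookie string in the usual "name=value; name2=value2" form copied from the developer tools (the names sessionid and theme are made up):

import requests

# hypothetical example values; replace with the string copied from your browser
cookie_str = 'sessionid=abc123; theme=dark'
# split "name=value; name2=value2" into the dict that requests expects
cookies = dict(pair.split('=', 1) for pair in cookie_str.split('; '))

res = requests.get("https://example.com/",
                   headers={'User-Agent': 'Mozilla/5.0'},
                   cookies=cookies)
print(res.status_code)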

Test environment: Sogou Browser.

Follow the steps below to obtain the Cookie of a page after logging in with your account:

step1: Log in to your account;

step2: Press F12 to open the developer tools and go to the Network -> Doc tab;

step3: Press Ctrl+R (or F5) to refresh. In the request list with the Name, Status, and Domain columns, the first row is usually the one you want. Single-click it (do not double-click), copy everything under the cookie field, and paste it into the code above in place of the "Cookie of the page obtained after logging in with your account" placeholder; likewise, put the URL that the Cookie belongs to in place of the "URL of the page to scrape" placeholder;

step4: Run the script (a quick way to verify the Cookie is sketched below).
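
Before scraping in earnest, it is worth confirming that the Cookie actually logs you in. A minimal check, assuming a hypothetical marker string MY_NICKNAME (e.g. your account's display name) that only appears on the page when you are logged in:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Cookie': 'Cookie of the page obtained after logging in with your account'
}
res = requests.get("URL of the page to scrape", headers=headers)
# MY_NICKNAME is a placeholder: any text visible only to logged-in users works
if 'MY_NICKNAME' in res.text:
    print("Cookie works: the page shows logged-in content")
else:
    print("Cookie missing or expired: log in again and re-copy it")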

 

Below is a complete web-crawler program, for study and reference only (P.S.: please respect the site's robots.txt).

import requests
import re

# request headers with a logged-in Cookie (copied from the browser's developer tools);
# renamed from the original, misleading name "self"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Cookie': 't=9b2e3b2d2ca9cc3ea9b377e34038612e; thw=cn; enc=FLDBgHQZTLcLCXVYOvApOOnLL%2F1Akz4Ed55sa0qt8DDjELEqawC35W1FP%2FTcpXPPcBtkuhGDla9dTXLH7O5SDw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; _samesite_flag_=true; cookie2=1a5992fbc34dd5f585d150f0c4509f83; _tb_token_=5373e3e3edbe1; cna=OE2GFruxfjQCAbfUvCJSoOWg; sgcookie=E2OEGOcm47eBTHqzpF991; uc3=lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dBxGR%2FhWk9RQvcyZI%3D&nk2=Cywt8xwZqs%2FKosL22Q%3D%3D&id2=UoTV6eqcJkRjIw%3D%3D; csg=2ddf0913; lgc=hanshuaihohai; dnk=hanshuaihohai; skt=96edeee27ff1dbaa; existShop=MTU4ODI2NTU4Ng%3D%3D; uc4=id4=0%40UOx%2FUchFS%2BOU79KlGDMAnH3u25zm&nk4=0%40CWshcrroMW23SiuQWBZ5fcdXWl62QOpv; tracknick=hanshuaihohai; _cc_=UIHiLt3xSw%3D%3D; tfstk=ceb5BL9lZYD5Z_bru9N24OYkZDYhaB0Xi49VNiLvlaMS1k5yHsxn7djbkpDjOXAf.; mt=ci=10_1; v=0; l=eBrS8yFnQ28pP0OQBOfZourza77OtBdYYuPzaNbMiT5P_b1HXeblWZjGWbYMCnGVh6rvR3-ormWwBeYBqiAgJeHab_YQu2Dmn; isg=BPPzpFHKqTEiZmVRiiXIZHW0jPcdKIfq22zaZ6WQAJJ1pBNGLfs0Onp2WtRKH9_i; uc1=cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&cookie21=URm48syIYB3rzvI4Dim4&existShop=false&pas=0&cookie14=UoTUMtLek5Btug%3D%3D'
}
def getHtmlText(url):
    """Fetch a page and return its text; return '' on any request error."""
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()                  # raise for 4xx/5xx responses
        r.encoding = r.apparent_encoding      # guess the encoding from the content
        return r.text
    except requests.RequestException:
        return ""
    
def parsePage(ilt, html):
    """Extract (price, title) pairs from the embedded JSON in the page source."""
    try:
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            # e.g. '"view_price":"3299.00"' -> '3299.00' (strip('"') is safer than eval)
            price = plt[i].split(':')[1].strip('"')
            # split once only, in case the title itself contains ':'
            title = tlt[i].split(':', 1)[1].strip('"')
            ilt.append([price, title])
    except Exception:
        print("parse failed for this page")
    
def printGoodList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("No.", "Price", "Product title"))
    print("items found:", len(ilt))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))
        
def main():
    goods = '洗衣机'          # search keyword ("washing machine")
    depth = 2                 # number of result pages to crawl
    start_url = 'search URL of the target site' + goods
    infoList = []
    for i in range(depth):
        try:
            # &s= is the result offset: the site lists 44 items per page
            url = start_url + '&s=' + str(44 * i)
            html = getHtmlText(url)
            parsePage(infoList, html)
        except Exception:
            continue
    printGoodList(infoList)

if __name__ == '__main__':
    main()
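
To see what the two regular expressions in parsePage actually match, here is a self-contained check against a synthetic snippet in the same "view_price"/"raw_title" shape (with the functions above already defined; the prices and titles are made up for illustration):

sample = '"view_price":"3299.00","raw_title":"Drum washing machine 10kg",' \
         '"view_price":"1599.00","raw_title":"Mini washing machine"'
demo = []
parsePage(demo, sample)
printGoodList(demo)   # prints two rows: 3299.00 / 1599.00 with their titles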

 
