Python爬虫模拟网站登录
需要的工具
1. 我们首先需要查看网站源码。这里用的是搜狗浏览器,它的开发者工具使用起来比较方便;Google Chrome 和 IE 也有开发者工具,操作相似。
import gzip
import http.cookiejar
import json
#import chardet
import os
import random
import sys
import urllib.parse
import urllib.request
import zlib
from urllib.request import quote,unquote

import requests
from bs4 import BeautifulSoup
class Download:
    """Cookie-aware HTTP client for www.gongzicp.com built on ``urllib``.

    The instance keeps the site URLs, a pool of User-Agent strings and a
    browser-like header dict.  ``getOpener`` builds an opener that stores
    cookies between requests; ``js`` uses it to fetch and print the
    nation-code endpoint.
    """

    def __init__(self):
        """Set up the endpoint URLs and the request headers."""
        self.server = 'https://www.gongzicp.com'
        self.ncode = 'https://www.gongzicp.com/login/nationCode'
        self.login = 'https://www.gongzicp.com/login/userLogin'
        # Several User-Agent strings; one is picked at random per instance
        # as a basic anti-anti-crawler measure.
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        ]
        # Browser-like request headers.
        self.header = {
            'Host': 'www.gongzicp.com',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'https://www.gongzicp.com',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            #'Referer':'https://www.gongzicp.com/novel-30233.html',
            'Upgrade-Insecure-Requests': '1',
            # random.choice follows the list length instead of a hard-coded 4.
            'User-Agent': random.choice(self.user_agents),
            # Only advertise encodings the standard library can decode;
            # brotli ("br") has no stdlib decoder.
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

    def getOpener(self, head):
        """Build, store and return an opener that keeps cookies.

        Args:
            head: mapping of header name to header value; every pair is sent
                with each request made through the opener.

        Returns:
            The ``urllib.request.OpenerDirector``, also stored as
            ``self.opener``.  The cookie jar and its processor are stored as
            ``self.cj`` and ``self.pro``.
        """
        # deal with the Cookies
        self.cj = http.cookiejar.CookieJar()
        self.pro = urllib.request.HTTPCookieProcessor(self.cj)
        self.opener = urllib.request.build_opener(self.pro)
        # addheaders expects a list of (name, value) tuples.  self.header is
        # deliberately left untouched so this method can be called again
        # with it (it used to be overwritten with the tuple list).
        self.opener.addheaders = list(head.items())
        return self.opener

    @staticmethod
    def _decode_body(raw, encoding):
        """Undo the HTTP ``Content-Encoding`` applied to a response body.

        Args:
            raw: response body bytes as read from the socket.
            encoding: value of the ``Content-Encoding`` header, or ``None``
                when the header is absent.

        Returns:
            The decompressed bytes (``raw`` itself for an unencoded body).

        Raises:
            ValueError: if the encoding is not gzip, deflate or identity.
        """
        encoding = (encoding or 'identity').strip().lower()
        if encoding == 'identity':
            return raw
        if encoding in ('gzip', 'x-gzip'):
            return gzip.decompress(raw)
        if encoding == 'deflate':
            try:
                return zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib
                # wrapper; retry with a negative window size.
                return zlib.decompress(raw, -zlib.MAX_WBITS)
        raise ValueError('unsupported Content-Encoding: %r' % encoding)

    def js(self):
        """Fetch the nation-code endpoint and print the decoded response."""
        self.opener = self.getOpener(self.header)
        # The context manager closes the connection once the body is read.
        with self.opener.open(self.ncode) as op:
            # Decompress according to what the server actually sent.
            data = self._decode_body(op.read(),
                                     op.headers.get('Content-Encoding'))
        # Turn \uXXXX escapes in the payload into real characters.
        # NOTE(review): 'unicode-escape' mangles raw non-ASCII bytes; this
        # presumably relies on the server sending ASCII-only text - confirm.
        data = data.decode('unicode-escape')
        print(data)
这里有两个需要注意的地方:
# Parse the HTML text held in `data` with the built-in "html.parser" backend.
# NOTE(review): `data` is presumably the decoded page text from the preceding
# snippet - confirm against the full method.
soup = BeautifulSoup(data, "html.parser")
#print(soup.prettify())
# Collect every <script type="text/javascript"> element and keep the string
# form of the whole result list for later text searching.
vue=str(soup.find_all('script',type='text/javascript'))
而remember的值也可以在源码中找到,应该是代表是否保存密码。这里选择了保存密码,所以为1。
4.获取了所有的数值后,就可以post数值进行登录了。
# Form fields sent to the login endpoint.
# NOTE(review): `user_id`, `user_passw` and `zzz` are not defined in the
# visible code; they must be set before this snippet runs.
postDict = {
'ncode':'86',
'username': user_id,
'password': user_passw,
'remember': '1',
'zzz':zzz
}
# URL-encode the form and convert it to bytes; passing a body makes the
# opener issue a POST instead of a GET.
postData = urllib.parse.urlencode(postDict).encode('utf-8')
# The response object is not inspected; any cookies set by the server are
# kept by the opener's cookie processor.
login_op = self.opener.open(self.login, postData)
print('have login')
# Fetch the user info to check whether the login succeeded.
getUserInfo_url='https://www.gongzicp.com/user/getUserInfo'
op = self.opener.open(getUserInfo_url)
data = op.read()
# Assumes the response body is gzip-compressed; gzip.decompress raises
# otherwise.
data = gzip.decompress(data)
# Turn \uXXXX escapes in the payload into readable characters.
data=data.decode('unicode-escape')
print(data)