以爬取包贝尔主页信息为例,用代码说明如何实现模拟登录
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from lxml import etree
import time
#登录selenium
def login(url, user, password):
    """Log in through the browser and return the HTML of the target page.

    Opens ``url`` in the module-level ``driver``, types ``user`` into the
    element with id ``email`` and ``password`` into the element with id
    ``password``, submits with the Return key, pauses five seconds, then
    loads the module-level ``ul`` address and returns its page source.

    Relies on the globals ``driver``, ``wait`` and ``ul`` being bound by the
    caller before this function runs.
    """
    driver.get(url)
    driver.maximize_window()

    # Fill the fields in order; the handle left over after the loop is the
    # password box, from which the form is submitted.
    field = None
    for element_id, text in (('email', user), ('password', password)):
        locator = (By.ID, element_id)
        field = wait.until(EC.presence_of_element_located(locator))
        field.send_keys(text)
    field.send_keys(Keys.RETURN)

    # Fixed pause after submitting, before navigating to the target page.
    time.sleep(5)
    driver.get(ul)
    return driver.page_source
获取个人简介和代表作;如需爬取其他信息,修改下面的 XPath 表达式即可
# 利用xpath获取网页信息
def parse_page(html):
    """Extract the two profile fields from a page's HTML source.

    Args:
        html: HTML source of the page as a string.

    Returns:
        A ``(user, topic)`` tuple of stripped strings: the first text node of
        ``<h1 class="avatar_title">`` and the first text node of
        ``<p class="authentication">``. A part that is not present in the
        page is returned as ``""``; empty or blank ``html`` (for example
        after a failed download) yields ``("", "")``.
    """
    # Nothing to parse: avoid handing an empty document to the HTML parser.
    if not html or not html.strip():
        return "", ""

    dom = etree.HTML(html)
    if dom is None:
        return "", ""

    def first_text(expression):
        # xpath() returns a list; it is empty when the node is missing, so
        # indexing it unconditionally would raise IndexError.
        matches = dom.xpath(expression)
        return matches[0].strip() if matches else ""

    user = first_text('//h1[@class="avatar_title"]/text()')
    topic = first_text('//p[@class="authentication"]/text()')
    return user, topic
ul 为包贝尔主页的网址;如需爬取其他网页,修改 ul 即可
if __name__ == '__main__':
    # login() reads ``driver``, ``wait`` and ``ul`` as module-level globals,
    # so they are bound here before it is called.
    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 10)
        # Site entry page that carries the login form.
        url = "http://www.renren.com"
        # Page to scrape once logged in; change this to scrape another page.
        ul = "http://www.renren.com/880792860/profile"
        user = input("请输入用户名:")
        password = input("请输入密码:")
        html = login(url, user, password)
        data = parse_page(html)
        print('{}:{}'.format(data[0], data[1]))
    finally:
        # Always end the WebDriver session, even when login or parsing fails,
        # so no browser/driver process is left running.
        driver.quit()
只需修改上面的第 2、4 部分即可,导入第三方库和获取信息的代码相同
修改2
import requests
from lxml import etree
#获取登录后网页源代码
#首先手动登录一次,将登录后的cookie加到headers中
def get_page(url):
    """Download ``url`` with a hard-coded logged-in cookie and return its text.

    The request carries a fixed ``User-Agent`` and a ``Cookie`` header copied
    from a manual login. The response is decoded as UTF-8.

    Returns:
        The response body as a string, or ``""`` if any exception is raised
        (the exception is printed).
    """
    request_headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Cookie': 'anonymid=k9ms9idkght7ea; depovince=GW; _r01_=1; JSESSIONID=abctH4Nu1Svub9nGWmlhx; ick_login=4ec1d30a-2073-4b56-957b-46a35291833a; taihe_bi_sdk_uid=84bc59a61b5ab331f13802d75d2ef4c1; taihe_bi_sdk_session=d5a1586e172ec99e162ebab493c1f30a; ick=4124d285-2c06-4b32-964c-eea910677987; XNESSESSIONID=1bfcb13bc2f6; first_login_flag=1; ln_uact=15832184726; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebe_key=138cdf0b-ded6-44c4-b2ed-d0f783393b53%7C9dac2838a292687518bbf3c5313bd627%7C1588253329601%7C1%7C1588253328944; jebe_key=138cdf0b-ded6-44c4-b2ed-d0f783393b53%7C9dac2838a292687518bbf3c5313bd627%7C1588253329601%7C1%7C1588253328951; wp=1; wp_fold=1; jebecookies=e614abe6-69db-48ea-8042-749ef59cac89|||||; _de=3A994080B05890C8BC1347536B8CB062; p=1d07e84cd3e56341724179838d20b9a07; t=b946317273136234495912a975d405697; societyguester=b946317273136234495912a975d405697; id=974339327; xnsid=348fb2fb; loginfrom=syshome'
    }
    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        # Turn HTTP error statuses into exceptions handled below.
        response.raise_for_status()
        response.encoding = 'utf-8'
        page_text = response.text
    except Exception as err:
        print(err)
        return ""
    else:
        return page_text
cookies获取:手动登录人人网,然后找到该请求的 cookies 并复制下来。如图
打开开发者工具(在页面上右键,选择“检查”),切换到 Network 面板,刷新页面,找到该网页对应的请求
用这个cookies代替代码中的cookies,其余不用改
修改4
url为包贝尔主页网址,如需爬取其他网址,修改即可
if __name__ == '__main__':
    # Address of the page to scrape; change it to scrape a different page.
    url = 'http://www.renren.com/880792860/profile'
    name, topic = parse_page(get_page(url))
    print('{}:{}'.format(name, topic))
修改2
import requests
from lxml import etree
def login(url):
    """POST the hard-coded login form to ``url`` and return the response text.

    The request is sent through the module-level session object ``s`` with a
    fixed ``User-Agent`` header. The response is decoded as UTF-8.

    Returns:
        The response body as a string, or ``""`` if any exception is raised
        (the exception is printed).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }
    # Form fields copied from a real login request; the "icode" field is
    # not included.
    form_fields = {
        "email": "15832184726",
        "origURL": "http://www.renren.com/home",
        "domain": "renren.com",
        "key_id": 1,
        "captcha_type": "web_login",
        "password": "89e63f831ebb68a47c50ade29ba604cabf1d1bdb11f1a5cd3a5fad1f34204c49",
        "rkey": "e85e89d5b673c4d6ba6c114a47c6a7f1",
        "f": "http%3A%2F%2Fwww.renren.com%2F974339327",
    }
    try:
        response = s.post(url, headers=request_headers, data=form_fields, timeout=10)
        # Turn HTTP error statuses into exceptions handled below.
        response.raise_for_status()
        response.encoding = 'utf-8'
        body = response.text
    except Exception as err:
        print(err)
        return ""
    else:
        return body
def get_page(url):
    """Fetch ``url`` through the module-level session ``s`` and return its text.

    The request carries a fixed ``User-Agent`` header. The response is
    decoded as UTF-8.

    Returns:
        The response body as a string, or ``""`` if any exception is raised
        (the exception is printed).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    try:
        response = s.get(url, headers=request_headers, timeout=10)
        # Turn HTTP error statuses into exceptions handled below.
        response.raise_for_status()
        response.encoding = 'utf-8'
        page_text = response.text
    except Exception as err:
        print(err)
        return ""
    else:
        return page_text
formdata获取
回到登录界面,打开开发者工具,切换到 Network 面板,勾选 Preserve log,然后登录
按登录请求中的表单数据修改代码中的formdata
修改4
if __name__ == '__main__':
    # Session object; login() and get_page() use it via the global name ``s``.
    s = requests.session()

    # Login endpoint.
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=202045949278'
    print(login(login_url))

    # Page to scrape after logging in.
    url = 'http://www.renren.com/880792860/profile'
    name, topic = parse_page(get_page(url))
    print('{}:{}'.format(name, topic))