from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
b = Chrome()
b.get('https://www.cnki.net/')
search = b.find_element_by_id('txt_SearchText')
search.send_keys('数据分析')
search.send_keys(Keys.ENTER)
time.sleep(1)
a_list = b.find_elements_by_css_selector('table.result-table-list td.name>a')
a_list[0].click()  # click the first search result
time.sleep(1)
# browser.window_handles - returns a list of handles for all currently open tabs
b.switch_to.window(b.window_handles[1])
# print(b.page_source)
# Get the abstract text
soup = BeautifulSoup(b.page_source, 'lxml')
print(soup.select_one('#ChDivSummary').text)
b.close()
b.switch_to.window(b.window_handles[0])
a_list[1].click()
time.sleep(1)
# Get the abstract text
b.switch_to.window(b.window_handles[-1])
soup = BeautifulSoup(b.page_source, 'lxml')
print(soup.select_one('#ChDivSummary').text)
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
import os
import re
# Collect the data of every search result on the current page
def get_one_page():
    # Get the link element of each search result
    a_list = b.find_elements_by_css_selector('table.result-table-list td.name>a')
    # print(a_list, len(a_list))
    all_data = []
    for x in a_list:
        x.click()
        time.sleep(1)
        # 5. Switch to the newly opened tab and scrape the detail page
        # browser.window_handles - returns a list of handles for all currently open tabs
        b.switch_to.window(b.window_handles[-1])
        # print(b.page_source)
        soup = BeautifulSoup(b.page_source, 'lxml')
        # Title
        title = soup.select_one('div.container div.wx-tit>h1').text
        # Abstract
        if soup.select_one('#ChDivSummary'):
            abstract = soup.select_one('#ChDivSummary').text
        else:
            abstract = None
        # Keywords
        if soup.select_one('div.container div.row>p.keywords>a'):
            keyword = soup.select_one('div.container div.row>p.keywords>a').text
            keyword = re.sub(r'\s', '', keyword)
        else:
            keyword = None
        all_data.append([title, abstract, keyword])
        b.close()
        # 6. Switch back to the first tab (the search results page)
        b.switch_to.window(b.window_handles[0])
    writer.writerows(all_data)
if __name__ == '__main__':
    # Check for the file before opening it in append mode (opening with 'a' creates it)
    is_exists = os.path.exists('files/知网数据分析.csv')
    writer = csv.writer(open('files/知网数据分析.csv', 'a', encoding='utf-8', newline=''))
    if not is_exists:
        writer.writerow(['论文题目', '摘要', '关键词'])
    # 1. Create the browser object
    b = Chrome()
    # 2. Open the page
    b.get('https://www.cnki.net/')
    # 3. Search for papers
    search = b.find_element_by_id('txt_SearchText')
    search.send_keys('数据分析')
    search.send_keys(Keys.ENTER)
    time.sleep(1)
    for _ in range(5):
        # Collect all data on the current page
        get_one_page()
        # Click the "next page" button
        next_btn = b.find_element_by_id('PageNext')
        next_btn.click()
        time.sleep(3)
Add cookies to the request to complete automatic login.
import requests
# Add cookies to complete automatic login
headers = {
'cookie': r'_zap=5a0255c0-2631-4533-b00b-11fa5c5bce29; d_c0="AKBdbvd55xSPTialrdtjTCxWDdfhwIFmbfk=|1651911504"; _9755xjdesxxd_=32; YD00517437729195%3AWM_TID=5LN74%2F0gFI9BRFRQUFPFBJLixsJlWxHn; _xsrf=sif67kGuztvrCG4lqumCAqog1ej8lC5C; captcha_session_v2=2|1:0|10:1652757335|18:captcha_session_v2|88:amRCVGh4eTN2SFFiVjBNRlNSRGFsTm85bmcrUGJSZGhSNWFma3UxaW1QNlhsNnBpa25MbG95R0xDcWlmeFNHTw==|6e620fa69489b60d1f5b83ba64de2a88020b0e6c36f14d63fc3de65b8ba7c043; __snaker__id=OZgoo8Q643crcGaQ; gdxidpyhxdE=gpAsrdc76JoW%5CAH7%5C6syS%2BfZKXeCBC%2BH7Nqg%5CyaPrKcSwI2S0eOIk5lt16d289pu7tCKzCmc8At4CS7weyBn4x2M7CwQK5iJWgLGd3YiaT2Q4uYixWJNPxu%5CyUSs8sQf%2BEndH5XB11GNqnBk5oc%2FMi6vE%2B3DL5%2BjD%2BacGGz3LdvvTm3%2B%3A1652758239269; YD00517437729195%3AWM_NI=JBeJoQFDBsMmmlDvVefYIBrGMcoPXstpFzybXyJDonY0qrqEobTBHr8Wkw0h9%2Fmt2hpMzvGhDHcxe%2FUTVTLxLZKpghvXo87QB6kp%2FQNVot6vXe1Pf3oReu0DieEX7%2FBMTEU%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeace23ea38a9d8ac560ae9e8ba6c54b929a8eb0d84b90938eb3c17f88f5bab3cd2af0fea7c3b92a8a9eababee5b869c8e94d067afb48c97d56fa68787a3f14d8fbda1b1db79b8b08a9bfc6a9ca786a5ee669bf0bf93f44fadbe9ca3cb7afc89a598b364fbf1a395b164b493b697c6648898ad99e825978c9ab5cf7490868591e664b688a1d9b65b82eaa591ae5b8e8aa195b57e9cbbffb1e539ad8eab96b350abbb8bb9b62582939ca6d837e2a3; z_c0="2|1:0|10:1652757841|4:z_c0|92:Mi4xOU04TUJBQUFBQUFBb0YxdTkzbm5GQ2NBQUFDRUFsVk5VYUtxWWdEcVNGX3BnTUlRVFFfcE5yMTNfeFZvYTFzSVZR|ce07c55884feb88c17e80bf9fb87f3bb3e087fba8fbb29a31d8e776712a5ddef"; q_c1=f2c0c46262fb43bca1ec55e7f5dc8a0d|1652757842000|1652757842000; NOT_UNREGISTER_WAITING=1; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1651911507,1652757336,1652767106,1652767236; SESSIONID=1CZ5prmJnH5MToLzVcvlNnGJxyvK9NCOKGjeLtVpNFf; JOID=Wl8VB0itC59FIacKIKBPBzrZ0uk8k0_Bd1raYG7ITK0McsNJXcMyHyslrwkjITdag_dBW4OaY8bgAwp76dr3Uac=; osd=UFAWBE6nBJxGJ60FI6NJDTXa0e82nEzCcVDVY23ORqIPccVDUsAxGSEqrAolKzhZgPFLVICZZczvAAl949X0UqE=; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1652767343; KLBRSID=ed2ad9934af8a1f80db52dcb08d13344|1652767469|1652767234',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}
res = requests.get('https://www.zhihu.com/', headers=headers)
print(res.text)
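The same cookies can also be passed through requests' cookies= parameter instead of the raw 'cookie' header; a minimal sketch, with the cookie string shortened to a placeholder (copy the real one from the logged-in browser):
import requests

# Reuse the same long cookie string as above (placeholder here)
cookie_str = '_zap=...; z_c0=...; q_c1=...'
# Convert "name=value; name=value" into a dict and pass it via cookies= instead of the header
cookies = dict(item.split('=', 1) for item in cookie_str.split('; '))

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}
res = requests.get('https://www.zhihu.com/', headers=headers, cookies=cookies)
print(res.status_code)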
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
b = Chrome()
b.get('https://www.taobao.com')
# Pause until you have logged in manually, then press Enter
input('完成登录:')
# Save the cookies of the logged-in session to a file
cookies = b.get_cookies()
with open('files/淘宝.txt', 'w', encoding='utf-8', newline='') as f:
    f.write(str(cookies))
search = b.find_element_by_id('q')
search.send_keys('运动鞋')
search.send_keys(Keys.ENTER)
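The script above persists the cookie list with str() and the next one reads it back with eval(); a minimal alternative sketch, assuming the same browser object b and a hypothetical files/淘宝.json path, is to serialize with the json module:
import json

# Save: get_cookies() returns a list of dicts, which is directly JSON-serializable
with open('files/淘宝.json', 'w', encoding='utf-8') as f:
    json.dump(b.get_cookies(), f, ensure_ascii=False)

# Load: read the list back without eval()
with open('files/淘宝.json', encoding='utf-8') as f:
    cookie_list = json.load(f)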
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
import os
import re
b = Chrome()
b.get('https://www.taobao.com')
# Read back the saved cookies (the file holds the str() of a list of dicts)
cookie_list = eval(open('files/淘宝.txt', encoding='utf-8').read())
for cookie in cookie_list:
    if cookie.get('secure'):
        b.add_cookie(cookie)
time.sleep(1)
# Reload the page so the cookies take effect
b.get('https://www.taobao.com')
# Subsequent operations
time.sleep(1)
search = b.find_element_by_id('q')
search.send_keys('运动鞋')
search.send_keys(Keys.ENTER)
time.sleep(1)
soup = BeautifulSoup(b.page_source, 'lxml')
# Get the container element of each product
goods_list = soup.select('div.grid.g-clearfix div.items>div')
# print(goods_list, len(goods_list))
writer = csv.writer(open('files/淘宝运动鞋.csv', 'w', encoding='utf-8', newline=''))
# if not os.path.exists('files/淘宝运动鞋.csv'):
writer.writerow(['商品名称', '价格', '店铺名称', '商品链接', '店铺链接', '付款人数'])
all_data = []
for goods in goods_list:
    # Product name
    name_goods = goods.select_one('div.row.row-2.title>a').text
    name_goods = re.sub(r'\s', '', name_goods)
    # print(name_goods)
    # Price
    price = goods.select_one('div.price.g_price.g_price-highlight>span').text + goods.select_one('div.price.g_price.g_price-highlight>strong').text
    # Shop name
    name_shop = goods.select_one('div.shop > a > span:nth-child(2)').text
    # Product link
    href_goods = goods.select_one('div.ctx-box.J_MouseEneterLeave.J_IconMoreNew div.row.row-2.title>a').attrs['href']
    if 'https:' not in href_goods:
        href_goods = 'https:' + href_goods
    # Shop link
    href_shop = goods.select_one('div.shop>a').attrs['href']
    if 'https:' not in href_shop:
        href_shop = 'https:' + href_shop
    # Number of buyers
    payment = goods.select_one('div.ctx-box.J_MouseEneterLeave.J_IconMoreNew div.deal-cnt').text
    print(name_goods, price, name_shop, href_goods, href_shop, payment)
    all_data.append([name_goods, price, name_shop, href_goods, href_shop, payment])
writer.writerows(all_data)
1) Set an implicit wait time: browser.implicitly_wait(timeout).
2) Once the wait time is set, every time the browser object looks something up on the page and the element is not found, the program does not raise an error right away; instead it keeps retrying within the timeout until the element is found or the time runs out.
3) Note: an implicit wait only needs to be set once per browser; it then applies to every element lookup.
Explicit wait: pause until a condition becomes true (or false) before the program continues.
1) Create a wait object: WebDriverWait(browser, timeout)
2) Add a wait condition: wait_object.until(condition), wait_object.until_not(condition)
Commonly used wait conditions:
1) text_to_be_present_in_element_value(locator, text): the value attribute of the specified element contains the given text
2) presence_of_element_located(locator): the specified element is present on the page
3) element_to_be_clickable(locator): the specified element is clickable (see the sketch after the demo below)
4) text_to_be_present_in_element(locator, text): the text content of the specified element contains the given value
Note: a locator is written as a tuple (By.XXX, value), e.g. (By.ID, 'key') or (By.CLASS_NAME, 'user_login').
from selenium.webdriver import Chrome
# Create the browser and open the page
b = Chrome()
b.get('https://www.jd.com')
# 1) Set the implicit wait time: browser.implicitly_wait(timeout)
b.implicitly_wait(5)
search = b.find_element_by_id('key')
# 2) Explicit wait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# a. Create the wait object
wait = WebDriverWait(b, 20)
print('==========1==========')
# b. Add the wait condition
# wait.until(EC.text_to_be_present_in_element_value((By.ID, 'key'), '鞋子'))
wait.until_not(EC.presence_of_element_located((By.CLASS_NAME, 'user_login')))
print('==========2==========')
b.refresh()  # refresh the page
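The demo above only exercises presence_of_element_located (and, in the commented-out line, text_to_be_present_in_element_value). A minimal sketch of conditions 3) and 4) on the same jd.com page; the (By.CLASS_NAME, 'cate_menu_item') locator and the text '家用电器' are hypothetical placeholders, not verified selectors:
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

b = Chrome()
b.get('https://www.jd.com')
wait = WebDriverWait(b, 20)

# 3) element_to_be_clickable: block until the search box is clickable; until() returns the element
search = wait.until(EC.element_to_be_clickable((By.ID, 'key')))
search.send_keys('鞋子')

# 4) text_to_be_present_in_element: block until the element's text contains the given value
#    (the locator and text below are placeholders, not verified against jd.com)
wait.until(EC.text_to_be_present_in_element((By.CLASS_NAME, 'cate_menu_item'), '家用电器'))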
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}
res = requests.get('https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js', headers=headers)
result = res.json()
# print(result)
for x in result['hero']:
    print(x['name'], x['title'])
import requests
import os
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
import time
from selenium.webdriver import Chrome
# Download every skin image of each hero into a folder named after the hero; each image is named after the skin
# First find the detail-data API for each hero; the pattern is:
# https://game.gtimg.cn/images/lol/act/img/js/hero/1.js
# https://game.gtimg.cn/images/lol/act/img/js/hero/2.js
def one_hero(id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    res = requests.get(f'https://game.gtimg.cn/images/lol/act/img/js/hero/{id}.js', headers=headers)
    result = res.json()
    # Get the hero's name
    hero_name = result['hero']['name']
    print(hero_name)
    # Create a folder named after the hero (exist_ok so a re-run does not crash)
    os.makedirs(f'files/英雄联盟皮肤/{hero_name}', exist_ok=True)
    for x in result['skins']:
        # Get the skin name
        skins_name = x['name']
        skins_name = skins_name.replace('/', '')
        # Get the skin image URL
        img_url = x.get('loadingImg')
        print(skins_name, img_url)
        # Download the image and save it to a file
        if img_url:
            res_img = requests.get(img_url)
            # print(res_img.content)
            # res_img.encoding = 'utf-8'
            with open(f'files/英雄联盟皮肤/{hero_name}/{skins_name}.jpg', 'wb') as f:
                f.write(res_img.content)
if __name__ == '__main__':
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    # }
    # # Get the number of heroes and iterate over each hero
    # res = requests.get('https://lol.qq.com/data/info-heros.shtml', headers=headers)
    # time.sleep(1)
    # res.encoding = 'gbk'
    # soup = BeautifulSoup(res.text, 'lxml')
    # print(soup)
    b = Chrome()
    b.get('https://lol.qq.com/data/info-heros.shtml')
    time.sleep(1)
    soup = BeautifulSoup(b.page_source, 'lxml')
    # Get the id of each hero
    hero_li = soup.select('#jSearchHeroDiv>li')
    # print(hero_li)
    for hero in hero_li:
        if hero.select_one('li>a'):
            id = hero.select_one('li>a').attrs['href'].split('info-defail.shtml?id=')[-1]
            # print(id)
            one_hero(id)
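tqdm is imported at the top of this script but never used; a minimal tweak, wrapping the hero loop so a progress bar is shown while the skins download:
from tqdm import tqdm

# Same loop as above, but with a progress bar over the hero list
for hero in tqdm(hero_li, desc='heroes'):
    if hero.select_one('li>a'):
        id = hero.select_one('li>a').attrs['href'].split('info-defail.shtml?id=')[-1]
        one_hero(id)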