import requests

word = input("Enter a search keyword: ")
url = "https://www.sogou.com/web"
params = {
    'query': word
}
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
resp = requests.get(url=url, params=params, headers=headers)
# Manually set the encoding of the response data
resp.encoding = "utf-8"
page_text = resp.text
file_name = word + ".html"
with open(file_name, "w", encoding="utf-8") as f:
    f.write(page_text)
print("ok")
Saving an image locally with the urllib library:
from urllib import request

url = "http://pic.sc.chinaz.com/files/pic/pic9/202009/apic28014.jpg"
# Download the image straight to disk (keep the .jpg extension to match the source url)
request.urlretrieve(url=url, filename="./456.jpg")
# bs4: scrape https://www.shicimingju.com/book/sanguoyanyi.html
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
# Request the home page and parse out the chapter titles and detail-page urls
url = "https://www.shicimingju.com/book/sanguoyanyi.html"
# Open the output file
fp = open("./text.txt", "w", encoding="utf-8")
# Page source of the home page
page_text = requests.get(url=url, headers=headers).text
# Hierarchical selectors for data parsing
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu>ul>li>a')
for item in a_list:
    title = item.string  # chapter title
    detail_url = "https://www.shicimingju.com" + item['href']  # detail-page url
    # Parse the detail page
    page_text_detail = requests.get(url=detail_url, headers=headers).text
    soup_detail = BeautifulSoup(page_text_detail, 'lxml')
    content = soup_detail.find('div', class_='chapter_content').text
    fp.write(title + ":" + content + '\n')
    print(title, 'downloaded!')
fp.close()  # close the file once all chapters are written
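# Optional throttling helper (hypothetical, not in the original code): the loop
# above fires one request per chapter back-to-back, and as the proxy notes below
# point out, high-frequency requests can get an ip banned. Sleeping briefly
# between detail-page requests makes the crawl gentler.
import time
import requests

def polite_get(url, headers, delay=0.5):
    time.sleep(delay)  # pause before each request
    return requests.get(url=url, headers=headers)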
# Scrape the image data and image names and save them locally
from lxml import etree
import requests

# url = "http://pic.netbian.com/4kmeinv/index.html"
url = "http://pic.netbian.com/4kmeinv/index_%d.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
for i in range(1, 6):
    if i == 1:
        new_url = "http://pic.netbian.com/4kmeinv/index.html"  # page 1 has no page number
    else:
        new_url = url % i  # url for every page after the first
    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'gbk'
    page_text = response.text
    # Parse out the image address and image name
    tree = etree.HTML(page_text)
    # Locate all the li tags (document-level parse)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        # Local parse: ./ means relative to the current li tag; xpath returns a list, hence [0]
        img_url = "http://pic.netbian.com" + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + ".jpg"
        # Save to disk
        img_data = requests.get(url=img_url, headers=headers).content
        with open('./text%s' % img_name, 'wb') as f:
            f.write(img_data)
        print(img_name, "downloaded")
print("All done!!!")
url = "https://www.aqistudy.cn/historydata/"
page_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(page_text)
hot_cities = tree.xpath("//div[@class='hot']//ul/li/a/text()")
all_cities = tree.xpath("//div[@class='all']//ul/div[2]/li/a/text()")
# Merge the two xpath expressions above with the union operator |
data = tree.xpath("//div[@class='hot']//ul/li/a/text() | //div[@class='all']//ul/div[2]/li/a/text()")
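# Quick sanity check (assumes the two node sets are disjoint, which they are here
# since they live under different divs): the union should return both lists combined.
print(len(hot_cities), len(all_cities), len(data))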
# https://xueqiu.com/ Scrape the news data from Xueqiu
# Site analysis: the first screen is not dynamic; scrolling to the bottom fires ajax requests for more data
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
url = "https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=113797&size=15"
# Create a Session object
session = requests.Session()
# First use of the Session; expectation: visiting the home page returns cookies
session.get(url="https://xueqiu.com/", headers=headers)
# Send the request with the session, which now carries the cookies
json_data = session.get(url=url, headers=headers).json()
json_data
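# To confirm the first request really did set cookies, the session's cookie jar
# can be inspected (a quick check, not required for the crawl itself):
print(session.cookies.get_dict())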
- Concept: proxy server
- What a proxy server does:
    - Forwards requests, which changes the request's ip address
- How do proxies relate to crawling?
    - A crawler may hit a target server with high-frequency requests in a short time, and the server will ban the offending ip
- Proxy anonymity levels:
    - Transparent: the target server knows you are using a proxy and knows your real ip
    - Anonymous: it knows you are using a proxy, but not your real ip
    - Elite (high anonymity): it does not know you are using a proxy, let alone your real ip
- Proxy types:
    - http proxies can only carry http requests; https proxies only https requests
    - http://http.zhiliandaili.cn/
- Fix for the error HTTPConnectionPool(host:xx) Max retries exceeded with url:
    - Set the Connection header's value to close (see the sketch after this list)
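# A minimal sketch of the fix described above (the url is a placeholder):
import requests
close_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
    'Connection': 'close',  # close the connection after each request instead of keeping it alive
}
# requests.get(url="xxxx", headers=close_headers)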
import random
import requests
# Request a site through a randomly chosen proxy
all_ips = [
    {'https': 'xxxxx'},
    {'https': 'xxxxx'},
    {'https': 'xxxxx'}
]
url = "xxxx"
requests.get(url=url, proxies=random.choice(all_ips))
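# A hypothetical extension of the snippet above: retry with a different random
# proxy when one fails, instead of giving up on the first connection error.
import random
import requests

def get_with_proxy(url, proxies_pool, headers=None, retries=3):
    for _ in range(retries):
        proxy = random.choice(proxies_pool)  # pick one proxy dict at random
        try:
            return requests.get(url, headers=headers, proxies=proxy, timeout=5)
        except requests.exceptions.RequestException:
            continue  # this proxy failed, try another one
    raise RuntimeError("all proxy attempts failed")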
# Chaojiying (captcha-solving service) client code:
# coding:utf-8
import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the mis-recognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
# A small self-made wrapper around the captcha-recognition call
def transform_code_img(imgPath, imgType):
    chaojiying = Chaojiying_Client('xiaozhulei', 'zhu1213556247', '908482')  # account, password, software ID (from the user center)
    im = open(imgPath, 'rb').read()  # path to the local captcha image file
    return chaojiying.PostPic(im, imgType)['pic_str']  # imgType is the captcha type, see the site's price list

# Call the captcha-recognition function
transform_code_img('./text/chaojiying_Python/a.jpg', 4004)
from lxml import etree
import requests

sess = requests.Session()  # create the session object
# Handle the dynamically changing request parameters
# 1. Parse the captcha image address out of this login page
login_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
page_text = sess.get(url=login_url, headers=headers).text
tree = etree.HTML(page_text)
# Parsed-out captcha image address
img_path = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = sess.get(url=img_path, headers=headers).content  # fetch the image bytes
# Save the image to local storage
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)
# Parse the dynamically changing request parameters out of the page source
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
# Recognize the captcha
code_result = transform_code_img('./code.jpg', 1004)
print(code_result)
post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
data = {
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "[email protected]",
    "pwd": "bobo328410948",
    "code": code_result,
    "denglu": "登录",
}
# The simulated-login request
response = sess.post(url=post_url, headers=headers, data=data)
page_text = response.text  # page source after a successful login
with open('gushiwen.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
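# Rough sanity check (a heuristic, not from the original notes): on success the
# server normally redirects to the collect page instead of back to the login form.
print(response.status_code, response.url)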
from selenium.webdriver import Chrome
import time
from lxml import etree

# Open the browser
chrome = Chrome()
# Type the Baidu address into the address bar
chrome.get("http://www.baidu.com/")
time.sleep(2)
# Grab the data currently rendered on the page
page_text = chrome.page_source  # this attribute returns all of the source loaded on the current page
tree = etree.HTML(page_text)
# Document-level parse
divs = tree.xpath("//div[@class='container']/div[@class='box']")
# Local parse
for div in divs:
    h1 = div.xpath('./h1/a/text()')[0]
    print(h1)
# Quit
chrome.quit()
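# Optional (a sketch using selenium's standard chrome options API): run the
# browser headless so no window pops up while the page source is collected.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')  # no visible browser window
opts.add_argument('--disable-gpu')
headless_chrome = Chrome(options=opts)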
# Simulated login with selenium: https://kyfw.12306.cn/otn/resources/login.html
from selenium.webdriver import Chrome
from selenium.webdriver import ActionChains  # action chains
import time
# pip3 install pillow
from PIL import Image  # for cropping the screenshot

chrome = Chrome()
chrome.get('https://kyfw.12306.cn/otn/resources/login.html')
chrome.maximize_window()
time.sleep(2)
chrome.find_element_by_xpath('/html/body/div[2]/div[2]/ul/li[2]/a').click()
# Locate the input by id
username_tag = chrome.find_element_by_id('J-userName')
# Type the username
username_tag.send_keys('17683734889')
password_tag = chrome.find_element_by_id('J-password')
# Type the password
password_tag.send_keys('zhu1213556247')
# Captcha handling: screenshot the page and send the captcha to Chaojiying
# Screenshot the whole login page
chrome.save_screenshot('./main.png')
# With the captcha image's top-left and bottom-right coordinates we can locate the captcha region
img_tag = chrome.find_element_by_xpath('//*[@id="J-loginImg"]')
location = img_tag.location  # top-left coordinates of the tag on the page
size = img_tag.size  # size of the tag on the page
# The crop box: (left, upper, right, lower)
rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
# Crop with the tools the Image class provides
i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('./code.png')  # code.png is the captcha image
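# Caveat (an assumption about the runtime environment, not from the original
# notes): on HiDPI screens the screenshot can be larger than the CSS-pixel
# coordinates reported by location/size, so the crop box may need scaling:
ratio = chrome.execute_script('return window.devicePixelRatio')  # e.g. 2 on retina displays
rangle_scaled = tuple(int(v * ratio) for v in rangle)  # use this box if code.png looks misaligned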
# Recognize the captcha image
result = transform_code_img('./code.png', 9004)  # returns coordinates like x1,y1|x2,y2
# Convert x1,y1|x2,y2 into [[x1,y1],[x2,y2]]
all_list = []  # [[x1,y1],[x2,y2]]
if '|' in result:
    for point in result.split('|'):
        x = int(point.split(',')[0])
        y = int(point.split(',')[1])
        all_list.append([x, y])
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    all_list.append([x, y])
for pos in all_list:
    x = pos[0]
    y = pos[1]
    # x, y is one coordinate that needs to be clicked
    # move_to_element_with_offset first locates a tag, then clicks at the x,y offset from it
    # perform executes the queued actions immediately
    ActionChains(chrome).move_to_element_with_offset(img_tag, x, y).click().perform()
    time.sleep(1)
# Click the login button
chrome.find_element_by_id('J-login').click()
time.sleep(3)
# page_text = chrome.page_source
# page_text
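# Optional post-login step (a sketch mirroring the commented-out lines above;
# the output filename is hypothetical): save the page source after the login
# redirect for offline inspection.
with open('./12306_after_login.html', 'w', encoding='utf-8') as fp:
    fp.write(chrome.page_source)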