爬虫的目的当然不仅仅是能下载图片网页视频等,大部分情况还是需要
获取数据
,获取数据的话就需要对html,xml,json等文件进行处理
xpath选择器
,BeautifulSoup
来选取网页节点,进一步获取数据requests
库,代替urllib.request,用来请求、代理他们的用法这里不做记录。
代理分类
- 透明(表面上是代理ip 实际上用的还是真实ip)
- 匿名(不会用真实的ip,知道是代理ip 但无法识别ip地址)
- 高匿(模拟浏览器,使用的是代理ip 以假乱真)
使用场景
爬取网站反爬机制会对ip进行限制(封ip)
使用
import urllib.request
import urllib.parse
# 配置代理对象,把协议作为键,主机和端口号为值
handler = urllib.request.ProxyHandler(proxies={'http':'183.33.192.247:9797'})
url = 'https://www.baidu.com/s?wd=ip'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
request = urllib.request.Request(url=url,headers=headers)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('proxy.html','w',encoding='utf-8') as fp:
fp.write(content)
fiddler
直接去官网下载fiddler
mac安装还需要下载mono
安装教程
mac启动
mono --arch=32 Fiddler.exe
注意
:使用了一下发现在mac上很不友好,windows下好评还是很多的
mac下使用青花瓷
Charles
下载
安装配置
Xpath插件的使用
- 打开Chrome浏览器的扩展程序(设置-更多工具-扩展程序)
- 把Xpath.crx拖拽到里面
- 把浏览器退出,重启
- Ctrl+Shift+X打开xpath帮助工具(浏览器上面的黑色输入框)
- 按住shift,鼠标移动到网页内的任意元素位置上,可以自动定位该元素的xpath路径
Sublime,
alt+command+f打开替换,windows是ctrl+h
上边输入
(.*?):\s(.*)
下边输入
"\1":"\2"
就可以替换为json正常格式>
import urllib.request
import urllib.parse
from lxml import etree
import os
def get_url(page):
if page == 1:
url = 'http://sc.chinaz.com/tupian/shuaigetupian.html'
else:
url = 'http://sc.chinaz.com/tupian/shuaigetupian_{}.html'.format(page)
return url
def request_data(url):
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
request = urllib.request.Request(url=url,headers=headers)
response = opener.open(request)
return response
def parse_data(response):
html_str = response.read().decode('utf-8')
# etree.parse()
tree = etree.HTML(html_str)
img_xpath = '//div[@id="container"]/div//img'
img_nodes = tree.xpath(img_xpath)
items = []
for img_node in img_nodes:
src = img_node.xpath('./@src2')[0]
alt = img_node.xpath('./@alt')[0]
item = {
'src':src,
'alt':alt
}
items.append(item)
return items
def download_data(items):
for item in items:
src = item['src']
alt = item['alt']
suffix = src.split('.')[-1]
file_name = alt+'.'+suffix
file_path = os.path.join('shuaige',file_name)
urllib.request.urlretrieve(src,file_path)
print(file_name+'下载完成')
print('全部下载完成')
if __name__ == '__main__':
page = int(input('请输入要查询的页码:'))
url = get_url(page)
response = request_data(url)
items = parse_data(response)
download_data(items)
import urllib.request
import urllib.parse
from lxml import etree
from bs4 import BeautifulSoup
url = 'http://quote.stockstar.com/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('gb2312')
# tree = etree.HTML(content)
soup = BeautifulSoup(content,'lxml')
# 使用BeatifullSoup定位tr_list
tr_list = soup.select('#datalist tr')
# 使用BeatifullSoup
for tr in tr_list:
td_list = tr.find_all('td')
for td in td_list:
# 如果string不行,还get_text()一定行
print(td.string)
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
from Item import GSXX
import json
def request_data(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
data = {
'jl':'北京',
'kw':'python',
'p':1,
}
data = urllib.parse.urlencode(data)
url = url + data
request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
return response
def parse_data(response):
content = response.read().decode('utf-8')
soup = BeautifulSoup(content,'lxml')
table_list = soup.select('#newlist_list_content_table table')[1:]
items = []
for table in table_list:
zwmc = table.select('.zwmc')[0].get_text().strip('\n')
gsmc = table.select('.gsmc')[0].get_text().strip('\n')
zwyx = table.select('.zwyx')[0].get_text().strip('\n')
gzdd = table.select('.gzdd')[0].get_text().strip('\n')
item = GSXX(zwmc,gsmc,zwyx,gzdd)
items.append(item)
return items
def transform_data(item):
return {name:item.name}
def save_data(items):
for item in items:
json_obj = item.__dict__
json.dump(json_obj,open('zhilian.json','a',encoding='utf-8'),ensure_ascii=False)
def study_json():
# 关于json文件操作的几个方法
# pickle模块 序列化操作
# json.load() 从json文件中读取json对象
# json.loads() 把一个字符串对象读取成一个json对象
# json.dump() 把一个json对象写入到文件中
# json.dumps() 把一个json对象读取成字符串对象
dic = {
'name':'dancer',
'age':10
}
json_str = json.dumps(dic)
print(type(json_str))
json.dump(dic,open('test.json','w',encoding='utf-8'))
json_obj = json.load(open('test.json','r',encoding='utf-8'))
print(json_obj,type(json_obj))
json_obj2 = json.loads(json_str,encoding='utf-8')
print(type(json_obj2),json_obj2)
if __name__ == '__main__':
# 带参数的GET请求
url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?'
response = request_data(url)
items = parse_data(response)
save_data(items)
Selenuim使用
- Selemuim 模拟浏览器行为
- pantormJS
import requests
from lxml import etree
import json
from selenium import webdriver
import time
def handle_video(name, url):
headers = {
'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1'
}
# r = requests.get(url=url, headers=headers)
# 在这又不对了,因为video标签是通过js动态加载进来的
# html_tree = etree.HTML(r.text)
# video_src = html_tree.xpath('//video[@class="vjs-tech"]')
# print(video_src)
# 发现数据也是动态添加进来的,所以这个时候使用phantomjs和selenium请求页面,执行其中的js代码,将真正的数据展示给你即可,然后再去解析它
# 创建浏览器对象
driver = webdriver.PhantomJS('phantomjs.exe')
# 向指定地址发送请求
driver.get(url)
# 让程序稍微休眠一下,去执行里面的js代码
time.sleep(5)
# 【注】得到网页源代码,此时page_source是执行js之后的网页代码
html_tree = etree.HTML(driver.page_source)
video_src = html_tree.xpath('//video[@class="vjs-tech"]/source/@src')[0]
# print(video_src)
# with open('1.html', 'w', encoding='utf-8') as fp:
# fp.write(driver.page_source)
r = requests.get(url=video_src, headers=headers)
filename = './video/' + name + '.mp4'
with open(filename, 'wb') as fp:
fp.write(r.content)
print(filename + '下载完毕')
def handle_index(url, json_url):
headers = {
'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1'
}
r = requests.get(url=json_url, headers=headers)
# 是否会将json格式的字符串转化为python对象
# print(r.text)
obj = json.loads(r.text)
# print(len(obj['data']))
print('开始下载。。。。。。。')
for video_obj in obj['data']:
# 获取视频名字
name = video_obj['video_id']
# 获取视频连接
video_url = 'https://365yg.com' + video_obj['source_url']
# print(name, video_url)
# 获取视频连接,并且下载视频
handle_video(name, video_url)
print('结束下载')
# 如下方式是不对的,因为内容是通过js动态添加的,所以我们需要抓包,捕获数据接口
# 通过xpath获取所有符合要求的a连接
# html_tree = etree.HTML(r.text)
# a_list = html_tree.xpath('//div[@class="title-box"]')
# print(a_list)
def main():
# 首页的url
url = 'https://365yg.com/'
json_url = 'https://365yg.com/api/pc/feed/?category=video&utm_source=toutiao&widen=3&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1253AF48A0FCE5&cp=5A4A4F4CAEE59E1&_signature='
# 得到首页的内容,获取里面所有的符合条件的a标签的href
handle_index(url, json_url)
if __name__ == '__main__':
main()
阳光视频下载
from selenium import webdriver
from handless import share_browser
import time
from lxml import etree
import os
import requests
shouye_url = "http://www.365yg.com/"
driver = share_browser()
driver.get(shouye_url)
time.sleep(3)
driver.execute_script("document.documentElement.scrollTop=1000000")
time.sleep(3)
detail_xpath = '//a[@ga_event="video_comment_click"]/@href'
tree = etree.HTML(driver.page_source)
detail_list = tree.xpath(detail_xpath)
for detail in detail_list:
detail_url = 'http://www.365yg.com' + detail
driver.get(detail_url)
time.sleep(1)
tree1 = etree.HTML(driver.page_source)
src_xpath = '//video/@src'
src = tree1.xpath(src_xpath)[0]
name = tree1.xpath('//h2/text()')[0]
if not src.startswith("http:"):
src = 'http:' + src
file_path = 'ygvideo'
file_name = name + '.mp4'
path = os.path.join(file_path,file_name)
r = requests.get(src)
with open(path,'wb') as fp:
fp.write(r.content)
利用第三方平台集成环境
云打码
- 注册开发者账号
- 登录并创建软件
- 记住软件id,key
- 下载python调试文档
- 将两个dll文件及py文件放到项目目录下
YDpython3.x.py
# -*- coding: cp936 -*-
import sys
import os
from ctypes import *
class Recgonizier():
def __init__(self):
# print('>>>正在初始化...')
self.YDMApi = windll.LoadLibrary('yundamaAPI')
self.appId = '软件id'
self.appKey = b'软件key'
# print('软件ID:%d\r\n软件密钥:%s' % (self.appId, self.appKey))
self.username = b'普通账号'
self.password = b'密码'
# 第一步:初始化云打码,只需调用一次即可
self.YDMApi.YDM_SetAppInfo(self.appId, self.appKey)
# 第二步:登陆云打码账号,只需调用一次即可
self.uid = self.YDMApi.YDM_Login(self.username, self.password)
if self.username == b'test':
exit('\r\n>>>请先设置用户名密码')
####################### 一键识别函数 YDM_EasyDecodeByPath #######################
def esay_recognition(self, file_path):
# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html
codetype = 1004
# 分配30个字节存放识别结果
result = c_char_p(b" ")
# 识别超时时间 单位:秒
timeout = 60
# 验证码文件路径
file_path = file_path.encode('utf-8')
filename = file_path
# 一键识别函数,无需调用 YDM_SetAppInfo 和 YDM_Login,适合脚本调用
captchaId = self.YDMApi.YDM_EasyDecodeByPath(self.username, self.password, self.appId, self.appKey, filename, codetype, timeout, result)
# print("一键识别:验证码ID:%d,识别结果:%s" % (captchaId, result.value))
return result.value
########################## 普通识别函数 YDM_DecodeByPath #########################
def normal_recognition(self, file_path):
print('\r\n>>>正在登陆...')
if uid > 0:
balance = self.YDMApi.YDM_GetBalance(self.username, self.password)
print('\r\n>>>正在普通识别...')
codetype = 1004
result = c_char_p(b" ")
file_path = file_path.encode('utf-8')
filename = file_path
# 普通识别函数,需先调用 YDM_SetAppInfo 和 YDM_Login 初始化
captchaId = YDMApi.YDM_DecodeByPath(filename, codetype, result)
return result.value
import requests
from bs4 import BeautifulSoup
from RecognizerFile import Recgonizier
# 请求这个页面,获取验证码图片
login_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
# 处理登录
post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
s = requests.Session()
response = s.get(login_url,headers=headers)
response.encoding='utf-8'
soup = BeautifulSoup(response.text,'lxml')
img_url = 'https://so.gushiwen.org' + soup.select('#imgCode')[0].attrs.get('src')
# 注意要使用绘画请求图片
img_response = s.get(img_url)
with open('getCode.jpg','wb') as fp:
fp.write(img_response.content)
r = Recgonizier()
code = r.esay_recognition('getCode.jpg')
print('验证码识别结果'+str(code))
a = soup.select('#__VIEWSTATE')[0].attrs.get('value')
b = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value')
print(a,b)
data = {
'__VIEWSTATE': a,
'__VIEWSTATEGENERATOR': b,
'from': 'http://so.gushiwen.org/user/collect.aspx',
'email': '邮箱@qq.com',
'pwd': '密码',
'code': code,
'denglu': '登录'
}
response = s.post(url=post_url,data=data,headers=headers)
response.encoding='utf-8'
with open('gushi.html','w',encoding='utf-8') as fp:
fp.write(response.text)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# 模拟登陆qq邮箱
driver = webdriver.Chrome()
driver.get("https://mail.qq.com/")
time.sleep(5)
# 切换iframe
driver.switch_to.frame("login_frame")
# 用户名 密码
driver.find_element_by_name("u").send_keys("******@qq.com")
driver.find_element_by_name("p").send_keys("your password")
time.sleep(5)
driver.find_element_by_id("login_button").click()
# driver.find_element_by_id("login_button")send_keys(Keys.RETURN)
time.sleep(5)
print(driver.get_cookies())
driver.save_screenshot('qq.png')
driver.quit()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
#模拟登录知乎
driver=webdriver.Chrome()
url='https://www.zhihu.com/signin'
driver.get(url)
driver.find_element_by_name('username').send_keys('189********')
time.sleep(3)
driver.find_element_by_name('password').send_keys('your password')
time.sleep(3)
driver.find_element_by_xpath('//form/button').click()
driver.save_screenshot('zhihu.png')
print(driver.get_cookies())
driver.quit()