Packet-capture tool: Fiddler (download link)
Installation guide: installing and configuring Python 3 on Linux (CentOS 7)
A separate summary of Python fundamentals is being written; if anything in this post is unclear, refer to that Python basics series.
Later posts will analyze the functions of each library used in web scraping in more detail, with annotated examples written up in Jupyter Notebook.
Libraries that simulate a browser sending requests, shipped with Python:
Python 2: urllib, urllib2
Python 3: urllib.request, urllib.parse
Converting between str and bytes:
encode()  str ==> bytes
    with no argument it defaults to utf8
    pass 'gbk' if you want GBK
decode()  bytes ==> str
    with no argument it defaults to utf8
    pass 'gbk' if you want GBK
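A quick sketch of the conversion (the sample string is just for illustration):
s = '爬虫'
b = s.encode()          # str -> bytes, UTF-8 by default
print(b)                # b'\xe7\x88\xac\xe8\x99\xab'
print(b.decode())       # bytes -> str, UTF-8 by default
print(s.encode('gbk'))  # pass 'gbk' explicitly for GBK bytes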
import urllib.request
url='http://www.baidu.com'
response=urllib.request.urlopen(url)
#print(response.geturl())
#print(response.getheaders())
#print(response.getcode())
#print(response.readlines())
#print(response.read().decode())
#Binary content such as images can only be written to disk in binary mode
#with open('baidu.html','w',encoding='utf8') as fp:
#    fp.write(response.read().decode())
#with open('baidu.html','wb') as fp:
#    fp.write(response.read())
#urllib.request.urlretrieve(image_url,'test.jpg')
import urllib.parse
#A URL may only contain a limited character set: letters, digits, underscores
#Anything else ($, spaces, Chinese characters, ...) must be percent-encoded first
#url='https://www.baidu.com/index.html?name=狗蛋&pwd=123456'
#ret=urllib.parse.quote(url)
#re=urllib.parse.unquote(ret)
#print(ret,re)
url='http://www.baidu.com/index.html'
#Suppose the query string carries the parameters name, age, sex, height
data={
    'name':'狗蛋',
    'age':18,
    'sex':'女',
    'height':180
}
'''
lt=[]
for k,v in data.items():
    lt.append(k+'='+str(v))
query_string='&'.join(lt)
url=url+'?'+query_string
'''
query_string=urllib.parse.urlencode(data)
quote      URL-encoding: converts Chinese and other unsafe characters into %xx escapes
unquote    URL-decoding: converts %xx escapes back into the original characters
urlencode  takes a dict, joins it into a query_string, and handles the encoding at the same time
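A quick sketch of quote/unquote, mirroring the commented-out lines above (the example URL is only for illustration):
import urllib.parse

raw = 'https://www.baidu.com/index.html?name=狗蛋&pwd=123456'
encoded = urllib.parse.quote(raw, safe=':/?&=')  # keep the URL-structure characters unescaped
print(encoded)                                   # the Chinese characters become %xx escapes
print(urllib.parse.unquote(encoded))             # back to the original string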
import urllib.request
import urllib.parse
word=input('请输入你想要搜索的内容:')
url='http://www.baidu.com/s?'
data={
'ie':'utf-8',
'wd':word
}
query_string=urllib.parse.urlencode(data)
url+=query_string
response=urllib.request.urlopen(url)
filename=word+'.html'
with open(filename,'wb') as fp:
    fp.write(response.read())
import urllib.request
import urllib.parse
url='http://www.baidu.com/'
#response=urllib.request.urlopen(url)
#print(response.read().decode())
#Forge the request headers yourself (spoof the UA)
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
#Build the request object
rep=urllib.request.Request(url=url,headers=headers)
#Send the request
response=urllib.request.urlopen(rep)
print(response.read().decode())
Spoof your UA so the server believes a real browser is browsing.
Build the request object with urllib.request.Request()
import urllib.request
import urllib.parse
post_url='https://fanyi.baidu.com/sug'
# word=input('请输入你要查询的英文单词:')
word='baby'
#Build the POST form data
form_data={
    'kw':word
}
#Sending the request
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
#Build the request object
req=urllib.request.Request(url=post_url,headers=headers)
#Process the POST form data
form_data=urllib.parse.urlencode(form_data).encode()
#Send the request
response=urllib.request.urlopen(req,data=form_data)
print(response.read().decode())
[Note] The form data must be processed like this:
form_data=urllib.parse.urlencode(form_data).encode()
In a Fiddler capture, an entry whose icon is a notepad with an arrow marks a POST request.
import urllib.request
import urllib.parse
post_url='https://fanyi.baidu.com/v2transapi'
form_data={
'from':'en',
'to':'zh',
'query':'wolf',
'transtype':'realtime',
'simple_means_flag':3,
'sign':275695.55262,
'token':'4b0f3853511b419c662b0927f85723bd'
}
#Carry over the request headers from the Fiddler capture, keeping only the useful ones
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
'Cookie': 'BAIDUID=6AD9F3298408D38EFC6F7B57B492336C:FG=1; PSTM=1564710881; BIDUPSID=82049E2562E471B0139D7EEFFD7E53E7; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; delPer=0; H_PS_PSSID=1429_21086_29522_29520_29098_29568_28837_29221_26350; PSINO=1; locale=zh; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1564630325,1565089581,1565184483; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1565186069; yjs_js_security_passport=65d91a6feb9235e8bf77ac03ee4a888c596e7f77_1565186092_js'
}
request=urllib.request.Request(url=post_url,headers=headers)
form_data=urllib.parse.urlencode(form_data).encode()
response=urllib.request.urlopen(request,form_data)
print(response.read().decode())
import urllib.request
import urllib.parse
url='https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
page=int(input('请输入想要的第几页的数据'))
number=20
#Build the GET parameters
data={
    'start':(page-1)*number,
    'limit':number
}
#Convert the dict into a query_string
query_string=urllib.parse.urlencode(data)
url+=query_string
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
req=urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(req)
print(response.read().decode())
import urllib.request
import urllib.parse
post_url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
city=input('请输入要查询的城市:')
page=input('请输入要查询第几页')
size=input('请输入要多少个')
formdata={
'cname':city,
'pid':'',
'pageIndex':page,
'pageSize':size
}
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
req=urllib.request.Request(url=post_url,headers=headers)
formdata=urllib.parse.urlencode(formdata).encode()
response=urllib.request.urlopen(req,data=formdata)
print(response.read().decode())
import urllib.request
import urllib.parse
import os
url='http://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'
ba_name=input('请输入要爬取的吧名:')
start_page=int(input('请输入要爬取的起始页码:'))
end_page=int(input('请输入要爬取的结束页码:'))
#Create the output directory
if not os.path.exists(ba_name):
    os.mkdir(ba_name)
for page in range(start_page,end_page+1):
    url_t=url.format(ba_name,(page-1)*50)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    req=urllib.request.Request(url=url_t,headers=headers)
    print('第%s页开始下载....'%page)
    response=urllib.request.urlopen(req)
    #Build the output file name
    filename=ba_name+'_'+str(page)+'.html'
    filepath=ba_name+'/'+filename
    with open(filepath,'wb') as fp:
        fp.write(response.read())
    print('第%s页结束下载.....'%page)
Both of these classes live in urllib.error.
Common exceptions: NameError, TypeError, FileNotFoundError
Exception handling structure: try-except
URLError:
HTTPError:
HTTPError is a subclass of URLError
[Note] When catching both at once, the except clause for HTTPError must come before the one for URLError
import urllib.request
import urllib.parse
import urllib.error
#url='http://www.maodan.com'
url='https://blog.csdn.net/m0_37622530/article/details/81257015'
try:
    response = urllib.request.urlopen(url)
    print(response)
except urllib.error.HTTPError as e:
    print(e)
    print(e.code)
except urllib.error.URLError as e:
    print(e)
urlopen()  give it a URL, it sends the request and returns the response; request headers cannot be customized
Request()  customizes the request headers and creates a request object
Advanced features (proxies, cookies) are added through handlers and openers.
Basic usage:
import urllib.request
import urllib.parse
url='http://www.baidu.com'
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
#Create a handler
handler=urllib.request.HTTPHandler()
#Build an opener from the handler
#The opener is an object; from here on, send requests with the opener's own methods instead of urlopen()
opener=urllib.request.build_opener(handler)
#Build the request object
req=urllib.request.Request(url,headers=headers)
response=opener.open(req)
print(response.read().decode())
Forward proxy: fetches data on behalf of the client
Reverse proxy: serves data on behalf of the server
Configuration:
In the browser: Settings > Advanced > Proxy > LAN settings > Use a proxy server for your LAN
In code:
import urllib.request
import urllib.parse
#Example proxy: 113.79.75.104:9797
#Create the handler
handler=urllib.request.ProxyHandler({'http':'113.79.75.104:9797'})
#Create the opener
opener=urllib.request.build_opener(handler)
url='https://www.baidu.com/s?&wd=ip'
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
req=urllib.request.Request(url,headers=headers)
res=opener.open(req)
with open('ip.html','wb') as fp:
    fp.write(res.read())
HTTP is a stateless protocol.
Cookies exist to record the user's identity across requests, e.g. when logging in to a website.
Simulating a login:
import urllib.request
import urllib.parse
import http.cookiejar
#Simulate a real browser: after the POST request is sent, keep the returned cookie inside the program
#Create a CookieJar object
cj=http.cookiejar.CookieJar()
#Create a handler from the CookieJar
handler=urllib.request.HTTPCookieProcessor(cj)
#Build an opener from the handler
opener=urllib.request.build_opener(handler)
post_url='http://eas.huat.edu.cn/default.aspx'
formdata={
'__EVENTTARGET':'',
'__EVENTARGUMENT':'',
'__VIEWSTATE':'Nv1tIU3IuGf31N311/BH0lZSTLaB0SM3Z4r+NqAA3cfDUgezuBDzQYIBItsSaShcdPgPql0PgpmgvjMY22c4xXZcEe1x75npfCzRa9VHnHzA8rcVTratbVAVjF3E1DsSGsrqMnIPXnKJyVr6712CgMOW9ur9Ay0NU/pafwvIr5KXcz72y7qSV6LTyh4zehOAvmlcl/qw3Y6GTGfSyzrQ9ZZsakFxqFxKayeNLDXbFtEq3urFZC34bF3Iw53a0WQ1a2Nj1px27sQEGTLV0ND7YKLrrOAdUKWh5DgLKHG2JFKRkriq32pxLVocmjW6ssDkTP+oexDsY9CCF7brZF1R6fKeYC7kwPiKf8TQAxa9HP89x5W+OHKraxhnTxGHiL34qyJVIkVDSlFZvIcprbDAquulqchhr5vbsZpWpMvyBhLSl4i4JkYQC+dUXZGre4RtKeRvTgOOedWmL491syhQg9/pN/wQ1SqyKcY5/7DYwtIRjAf+yMW8xT0B8zJkwcPzIdajOm/pbHkfIjXGYANRRtIIHHaXl0o0ebAnYNu1Kt6NgC+Z92pRIZahGj4HGG+eWJ9RDMwF/xkqej6YrLNevH5j1/33z08yV5l68h8XPN5iIZtg0FCgIi1Y50is1ooec8OP9D6UXmpiu9ukcaMQ4pKxxUj2ADz3gk1CTJN2+3BKETYvQan985Re84ubo93VcG2AumeDh7lhR3q+HxIU0Y3TIE6v3mWp5B86bpt1CuzLPrDp87Mj0F3oCGpN2Rqc9yA/2a5h6vOjC3OxtCyKcqq2k3zZfwuTp0bgTqq/ScErdx4K+KMRXsRe0pcAnx8TeCx7vU3pnoGNYGSZ/s8A23REyYDyRsvxjj7qVdE1VoDVOKmlTAS/BQ+zBPXf7W1fOm0B35NLAjht+JXFZ+1daHXfjsw+Sod4CUcEECEjQZ9DGO476eFS0O8AfXTIKdvj8bHzHVAsxKB9ypRa/75LdvCkJCVfT83/dK+Qv34UiII3m+rS9WElU/VmV6z5Om2sWsOJs+lMZjuHADVpBZlW/0Vnt9xwbx0EMYLSYGN/pp5jFCPjKBcd3W0IQNhUoJlvjSOJbKuui6uxM3CSBZf5/j5H9sVvjiW8sQkiT8OYHv4o5yo4GMYaUlTysi7RwROjI6rr00bt/JyxgyFHeCLuVPLGBAk3pMVSclvTda8fLSaY2PyzyBaXHhYc5S8whsPd7McVhW+DKJnn2+JOTudPXcQNUO9Je22INMwSljp0dKUzuuNvpG2heKFtF0/PwfZmAO1ucjZmYcenOGw3rNZVsrymj3ldegrZeNbD6Ez0JPmiesWeG7HlKAFdqTyd8ylg95MiJNm39nHsSA8yW5Nx6kXPTr5nyl9GiLDwuA3LL6fNiJAZXgbYfuzkhSfp1I0bC6r4zIxKdEh8YZmiTcBvvIam8vDrj0y34d7miPx4dX5FUczam7ok89Dqm8Kn8RJTl72vwHCBGFSmZQIcAN4HIw8/okBmbU4fBOoPEILmscwpceFn7lfYdPP4HlXsWvrySERRZUY2rRhKVx7JTm5kM/zITWfi0t0gZrRBUft/iPCRkPmGYPafwomnDaLzDHPoYJuewLqDzDweZS7COxGSiopKD/wwDU++0cbpOJDofxz1NOkQTJDNRB2peN9v41Vrw6Gok1i6g3PG/6kKWI1a/gaH1vTfYsfc8/FDHGc5SoRSxS/Q6X+fOrydU+2Oht1C7Y6359WkDeP6hZRkYfHKxg++/QXHGSKY8DLyqgo1QSsuTvcRQ5+8OAFTnh2TcqbQKonOvkPROdf8dLBbHemxawWmoi5HHykVQZKqdf839olyLykSuW0C5cL5Gx4K4eGyxd/0ImUEMljkGjPzk9V59QrxKMzW1PmZ3IB3Zy1Uo4fNKzDdlSGHd7c6EeSZoXiJmr3IVCbpf11FkkPZoHmivMzgZSCJ2gNIii+n6FIuKwnF+UCLN91wDKmBXo3iRDYqHhtWBc8aa0QnpsWNWHT/6UP6xbHPKIT4i6S4wVKLsKO8TbpocSm/mvl/xh+zXNj6+pxiCZ8liiwADTFDLOEXbIJuptj8y8v989h/+gT4XndG71EKoU7Vxy6ws07urmiDElQaPE5dYzC7PxjEnbmivD2+70vUvlMpQb9mF4Ca+Jvk9bnVsDZaB91SqfGBF4b5rf2+zL+kgtmSMGNPYQMUjwJgSvWortqxZNqTcrpmSeiZIbPHnrmTvq6yXdZ+7ojEP2/T9qImFYPFcTbHzFkF8Su7O5kdJgRjmXdwHWMP/kvXbYpR1mvjLhYR78uq80Y4Dv9BnvIwshbxEm6kdVU8ZC/K2teMDqGjIc+Kv+btTsSOdEZOwRU20+prGFeoMm6MDNNStzd6vAXRYVGSONVNskknkdi9rhN9INK9qI9xqx8X6kr0l+VwVeBfZ94ZYtdAtvmqFYEIVgW3aYxbkVzWGYex/FvwpmFHbaQz6yrEvlehSXu1gi9DIiv10shuXo8naWM4ftbyRZIpNxV2QOj2lC+6E/u2ZQoqysjbsF6FqUBURRW+mSOwRBAIaKEzcSfCT0JjfBs/xY3xyXYA6fNQXAdKnWep4y8l908aHdQaKKAqkEN4Dnejubll/UyjsCvR2cJLnXPpw9KYII6xKehLD4Cc4LpB5ZvwZGDMmBsXspp8XQ60t8Wog9ZpjdEAPDhSef0KGOSE2ex2IWEChV26LGPHnmYwoGUPOWSc6ENZ8iQTttgmNsPuZGXShoIUx3SUGGU0yFs/lu4vlTyFWzvpdRLKYQYXufDkXy9GsRNKO8yl01eNkFCVHTWID8KViY6pBfMmUuNCIJnt1/jp4wkJHgABW+OR5+8dIE0x8iWpPuHS+s/SeTlk3f0DtinX2j0hJXDgeowpl+T8osKnVOpp0myO/UBjiRCX7POzBrgpbGaWL4AwZzHvcNTKinJed7veUznboYWXk3L7NYAvzSUReFpmx/U6cNgvtlUI5em4+5Nlaifr/KXSv8Igzvp+XDJivHodw/NHqkI3HszV2oLFRC08K5O2Y5fl7RfC4hWnvAT7rJc2UkQkRPujnSq+f4u2xAQ58iYNTJ5sNJOeXIkdN+VfeuFIsinPJ2k/J0U7O8WabOx0xf6GZLwJmRMHSeOD1XJncTmAz4uIQR4PpZY+hRqAxkyOt3l07RZLYnfUKVXgb8ioSSoQ3kwZB6WNxLi7BSdJrZm4mbLHihB08vFTw/nXt0OYq3wKYbFNUWZ286UwAN9hOOKaq8kbcDPj5hhc8s0FIAc/uE2SviU7r3kRf9CyFzK8UpUF7lMOmBcfLFxL2TtRpzZpktCn/DScTHCP8EoD7fp8fiTxkItnzTA6HwkyQRYpa7nOvWCm8FdvVZNodpM1kzoh/TYSf5ATK3OagDdPwiYepDhcodcm3fKeWI7LTPf/cy2+PWysWXnib/svTSGL/crkfxVMqluS4o/qybuVFIXQW/oj2nmtJNeAWDKIWJPIpowJ7+DII6YaN5242T5CCDCxaKU+8INA7zXyCWq0YfktLmjsaDN5pq05
whksrFS9aXEycE3lhDzO2YvpbM/JYISoV8S3WucAyA0jZFHVVSlZxjCHihNnalxjCwFvr7yLru1RhJHpPKRwRKmLtY8YTpcVM0J4Itnqpke13D3faiHiAAflbhMB8ZN37sVbrecfvGIrILVBVOTUD4At+zgQNwQRR3IcuJpo5ra4svXWIYtLfZk/mX1Y8dO9mkaIWpFN7baGsdpoRSqa4WwCJWpMtSqsFEleR6ALnbsi0mQcXZq9go+a02nI+0AOxuKthPgBDaXtsxHdVK8RLOHAQgfpmH4h6l4bOveP7NS/tqx423vN7o9KAJxNNAN9ntNmqMbAJHwVvqOXNqDDAXw7S51XQZ8zJY9kiaiSt6ZvJQ0CC0mGRRBy9k0iIj/JDXiirjYtLRHWA919IkYZ1zMuFCL2pCQQNtfiDgQGyhSKZR85hq8fXSrIRl3N1mY/25azMO2rLRiFg7M8N9Q0ajaZEF1QRi0aDXiHLga9xCy6IeMc60fvEesVY24eAWndprbXYwo/lV9yVS+yv0OT+tVw60Ajx/akhuu7Sf2PIL9mHWxEDf4W/hfKocfPVAJn3s1ifOTGWzh9PfgHk5CTjNVdaayT+e4CBCQHbRmeeIAh2NSHSfVWBQbvdTnl137M7W7w8BANOFMszfO5tHVe04EPawtee4N0QfvHaeguLzuz7I0sshVhLF4kBZPiUslYxyXPx44YHTZkBeqbv9SD16JyH6zmb+j+DUNyrTmyDgW0Gn5zkcnHgbzCPpveB4pn7j4H8DJ9gN08G1vIEAnfilatJ/1Jj0edEfR6P2fyqkO3ONekflywxwZhg0VSdlGBE71lhPg9PIEfYmX46wG/7ZO0gxhMIQGhIcIJq+A7nISw8fReFOpCdoMOlyjfgyuROpTbAAw/27+bRlsVLCQ1m0M9SUELaM1Rhqj01ixJuC9hqQuFRlkc7Zz2YS8NJ162FOSlEyZ9SJ/sLjqgd2GqervrhDSjxugn3U31IcfQgS3hJthjSgwKLadFPs0bCaL5N0dKfgViGEsaA48fCTtMAIjBLOAzRJgsL9jEDb44yDLTumcFIFkgnDHCOEdBl4Aka15GBXFmG09GamNyUNepDnfS//OcNYY+GgI3jSlLpp+e',
'__VIEWSTATEGENERATOR':'CA0B0334',
'__VIEWSTATEENCRYPTED':'',
'__EVENTVALIDATION':'X5gBqKArFeHnk+mwZrETuquN93E38Vd1ZTwljAvk/jyMz1phqxuJXKV8vt6CU4xZ/SkinQZIU7kYA/Yv4uutcVP3emCxE5K/g3TFCAcTKXVrjvoYq5XJ5K0/EAhj40sEyDAQSFi1fqJNAsk4VlGgUB5arxUtl6jy6Bh4VcOwZk7zZlYgX51NAEqLzZHtLGx/',
'ctl00$txtUserID':'201600669',
'ctl00$txtUserPwd':'zcz980725',
'ctl00$btnLogin':'登 录'
}
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
req=urllib.request.Request(url=post_url,headers=headers)
formdata=urllib.parse.urlencode(formdata).encode()
res=opener.open(req,data=formdata)
#print(res.read().decode())
get_url='http://eas.huat.edu.cn/SelectResult/StudentCourseTable.aspx'
request=urllib.request.Request(url=get_url,headers=headers)
response=opener.open(request)
print(response.read().decode())
From this point on, every request is sent with opener.open(), because the opener carries the cookie along.
Purpose: match a whole class of strings that follow the same rule.
.  : any character except a newline
[] : any single character from the set
\d : a digit
\D : a non-digit
\w : digit, letter, underscore (and Chinese characters, since Python 3 patterns are Unicode)
\W : the opposite of \w
\s : any whitespace character
\S : any non-whitespace character
*  : any number of times (>=0)
+  : at least once (>=1)
?  : optional, 0 or 1 time
{m}   : exactly m times
{m,}  : at least m times
{m,n} : at least m, at most n times
\b \B : word boundary / non-boundary
$  : anchors the end
^  : anchors the start
() : treats the enclosed pattern as one unit, e.g. (ab){4}
() also creates a sub-pattern (group) that can be back-referenced with \1, \2:
import re
#The HTML tags in this example were stripped by the blog platform; a plausible reconstruction
#matches properly nested tag pairs, where \1 and \2 refer back to the text captured by the groups
string='<html><h1>猪八戒</h1></html>'
pattern=re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
ret=pattern.search(string)
print(ret)
.*? .+? : the non-greedy versions
re.I : ignore case
re.M : multi-line matching (^ and $ anchor each line)
re.S : makes . match any character, including newlines
re.match   : matches only at the start of the string
re.search  : finds the first match anywhere in the string
re.findall : finds every match
re.sub(pattern, replacement, string) : substitution; the replacement may also be a function
import re
def fn(a):
    #a is a Match object; group() returns the matched text
    ret=int(a.group())
    #print(ret)
    return str(ret-10)
string='身高175'
pattern=re.compile(r'\d+')
ret=pattern.sub(fn,string)
print(ret)
re.compile : compiles a pattern into a reusable regex object
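A quick sketch of compile plus the flags above (the sample text is only for illustration):
import re

text = 'Python\npython\nPYTHON'
pattern = re.compile(r'^python$', re.I | re.M)  # ignore case, anchor line by line
print(pattern.findall(text))                    # ['Python', 'python', 'PYTHON']
print(re.findall(r'Py.*ON', text, re.S))        # with re.S the . also crosses the newlines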
import re
import time
import urllib.request
import urllib.parse
import os
def handle_request(url,page):
    url=url+str(page)+'/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    request=urllib.request.Request(url=url,headers=headers)
    return request
def download_img(content):
    #The HTML tags in the original pattern were stripped by the blog platform;
    #the idea is to capture each image's src attribute, roughly like the pattern below
    pattern=re.compile(r'<div class="thumb">.*?<img src="(.*?)".*?</div>',re.S)
    lt=pattern.findall(content)
    for img_src in lt:
        img_src='https:'+img_src
        dirname='qiutu'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        filename=img_src.split('/')[-1]
        filepath=dirname+'/'+filename
        print('%s图片正在下载......'%filename)
        urllib.request.urlretrieve(img_src,filepath)
        print('%s图片结束下载......'%filename)
        time.sleep(1)
def main():
    url='https://www.qiushibaike.com/pic/page/'
    start_page=int(input('请输入起始页码:'))
    end_page=int(input('请输入结束页码:'))
    for page in range(start_page,end_page+1):
        print('第%s页开始下载'%page)
        req=handle_request(url,page)
        content=urllib.request.urlopen(req).read().decode()
        download_img(content)
        print('第%s页结束下载'% page)
if __name__ == '__main__':
    main()
import urllib.request
import urllib.parse
import re
def handle_request(url,page=None):
    if page!=None:
        url=url+str(page)+'.html'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    request=urllib.request.Request(url=url,headers=headers)
    return request
def get_text(a_href):
    request=handle_request(a_href)
    content=urllib.request.urlopen(request).read().decode()
    #The HTML tags in the original patterns were stripped by the blog platform; the class names
    #below are illustrative placeholders. The idea: grab the article body, then strip its <img> tags.
    pattern=re.compile(r'<div class="neirong">(.*?)</div>',re.S)
    lt=pattern.findall(content)
    text=lt[0]
    pat=re.compile(r'<img .*?>')
    text=pat.sub('',text)
    return text
def parse_content(content):
    #Likewise reconstructed: capture each article's href and title from the list page
    pattern=re.compile(r'<h3><a href="(.*?)">(.*?)</a></h3>',re.S)
    lt=pattern.findall(content)
    for href_title in lt:
        a_href='http://www.yikexun.cn'+href_title[0]
        title=href_title[-1]
        text=get_text(a_href)
        string='<h1>%s</h1>%s'%(title,text)
        with open('lizhi.html','a') as fp:
            fp.write(string)
def main():
    url='http://www.yikexun.cn/lizhi/qianming/list_50_'
    start_page=int(input('输入起始页码'))
    end_page = int(input('输入结束页码'))
    for page in range(start_page,end_page+1):
        request=handle_request(url,page)
        content=urllib.request.urlopen(request).read().decode()
        parse_content(content)
if __name__ == '__main__':
    main()
from bs4 import BeautifulSoup
Usage: parse an HTML document into a BeautifulSoup object, then look up content through that object's methods and attributes.
Create the soup object first, then:
soup.a.attrs           all attributes and values of the tag, returned as a dict
soup.a.attrs['href']   the href attribute
soup.a['href']         shorthand for the same thing
soup.a.string
soup.a.text
soup.a.get_text()
If the tag contains nested tags, .string returns None, while the other two still return the text content.
soup.find('a')                 the first <a> that matches
soup.find('a',title='xxx')
soup.find('a',alt='xxx')
soup.find('a',class_='xxx')
soup.find('a',id='xxx')
find() can be called not only on the soup but also on an ordinary tag object (e.g. a div), in which case it searches inside that tag.
find() always returns only the first matching tag.
select(): pick content with CSS selectors.
Common selectors: tag, class, id, combined, descendant/child, pseudo-class, and attribute selectors.
div > p > a > .lala   matches only direct children, level by level
select() always returns a list; index into it to get a tag, then read its attributes and text. It too can be called on an ordinary tag object, returning all matching nodes under that tag.
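A quick sketch of the calls above (the HTML snippet and the lxml parser choice are only for illustration):
from bs4 import BeautifulSoup

html = '<div id="box"><a href="/a1" class="lala" title="t1">first</a><a href="/a2">second</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.a.attrs)                      # {'href': '/a1', 'class': ['lala'], 'title': 't1'}
print(soup.a['href'])                    # /a1
print(soup.find('a', title='t1').text)   # first
box = soup.find('div', id='box')
print(box.select('a.lala'))              # select() always returns a list of tags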
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
import json
import time
class ZhiLianSpider(object):
    url='https://fe-api.zhaopin.com/c/i/sou?'
    def __init__(self,jl,kw,start_page,end_page):
        #Save the arguments as instance attributes
        self.jl=jl
        self.kw=kw
        self.start_page=start_page
        self.end_page=end_page
        self.items=[]
    def handle_request(self,page):
        data={
            'start': (page-1)*90,
            'pageSize':90,
            'cityId':self.jl,
            'kw':self.kw,
            'kt':3
        }
        url_now=self.url+urllib.parse.urlencode(data)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
        request=urllib.request.Request(url=url_now,headers=headers)
        return request
    def parse_content(self,content):
        dit=json.loads(content)
        data_lists=dit['data']['results']
        for data_list in data_lists:
            zwmc=data_list['jobName']
            gsmc=data_list['company']['name']
            zwyx=data_list['salary']
            gzdd=data_list['city']['display']
            gxsj=data_list['updateDate']
            item={
                '职位名称':zwmc,
                '公司名称':gsmc,
                '职位月薪':zwyx,
                '工作地点':gzdd,
                '更新时间':gxsj
            }
            self.items.append(item)
    def run(self):
        for page in range(self.start_page,self.end_page+1):
            print('开始爬取第%s页'%page)
            request=self.handle_request(page)
            content=urllib.request.urlopen(request).read().decode()
            self.parse_content(content)
            print('结束爬取第%s页'% page)
            time.sleep(2)
        string=json.dumps(self.items,ensure_ascii=False)
        with open('zhilian.txt','w',encoding='utf-8')as fp:
            fp.write(string)
def main():
    jl=input('请输入工作地点')
    kw=input('请输入工作关键字')
    start_page=int(input('请输入起始页码'))
    end_page=int(input('请输入结束页码'))
    spider=ZhiLianSpider(jl,kw,start_page,end_page)
    spider.run()
if __name__ == '__main__':
    main()
XML is used to store and transport data. It differs from HTML in two ways:
HTML is for displaying data, XML is for transporting it;
HTML tags are predefined, XML tags are user-defined.
XPath is a path-expression language for locating elements inside an XML (or HTML) document.
//  : search anywhere, regardless of position
./  : search downward from the current node
@   : select an attribute
bookstore/book        all book elements that are direct children of bookstore
//book                every book element
bookstore//book       all book elements under bookstore (children and deeper descendants)
/bookstore/book[1]        the first book under bookstore
/bookstore/book[last()]   the last book under bookstore
/bookstore/book[position()<3]   the first two books under bookstore
//title[@lang]            every title that has a lang attribute
//title[@lang='eng']      every title whose lang attribute equals eng
*   : any element node
Attribute matching: //input[@id='kw']   //input[@class='bg s_btn']
Hierarchy: //div[@id='head']//a[@class='toindex']
Indexing: //div[@id='head']/div/div[2]/a[1]   (indexes start at 1)
Logical operators: //input[@class='s_ipt' and @name='wd']
Fuzzy matching: contains(), starts-with()
Extracting text:
//div[@id='u1']/a[5]/text()   text directly inside the node
//div[@id='u1']/text()        text of the node itself, excluding its child tags
//div[@id='u1']//text()       all text, including the text inside child tags
ret=tree.xpath('//div[@class="song"]')
string=ret[0].xpath('string(.)')
string.replace('\n','').replace('\t','')
string(.) simply concatenates all the text inside the node
Extracting an attribute: //div[@id='u1']/a[5]/@href
from lxml import etree
Parse the HTML document into an element-tree object, then call the object's methods to find the nodes you want.
It can be used in two ways: etree.HTML() parses an HTML string, etree.parse() reads from a file. Either way:
ret=tree.xpath('path expression')   # ret is always a list
[Note] After an xpath call has returned an element, a further xpath call on that element must start its expression with '.' so the search is relative to that node.
The \n characters that appear when the result is stored are real; the line breaks take effect once the text is read back and used on its own.
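A quick sketch of the entry points and a relative xpath call (the HTML snippet is only for illustration):
from lxml import etree

html = '<div id="u1"><a href="/1" class="song">one</a><a href="/2">two</a></div>'
tree = etree.HTML(html)              # parse a string; use etree.parse(path, etree.HTMLParser()) for a file
ret = tree.xpath('//div[@id="u1"]')  # xpath always returns a list
div = ret[0]
print(div.xpath('.//a/@href'))       # note the leading '.' for a search relative to div
print(div.xpath('string(.)'))        # 'onetwo' - all the text concatenated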
Lazy loading: load an image only when it is needed.
How it works: images that are not yet on screen keep their real URL in an attribute such as src2 or data-src; as the page is scrolled, JavaScript copies that value into src. This way the page avoids loading every image at once, yet the real URL is still visible in the page source.
Example:
import urllib.request
import urllib.parse
from lxml import etree
import time
import os
def handle_request(url,page):
    if page==1:
        url=url.format('')
    else:
        url=url.format('_'+str(page))
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    request=urllib.request.Request(url=url,headers=headers)
    return request
def parse_content(content):
    tree=etree.HTML(content)
    image_list=tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    for image_src in image_list:
        download_img(image_src)
def download_img(image_src):
    dirpath='xinggan'
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    filename=os.path.basename(image_src)
    filepath=os.path.join(dirpath,filename)
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    request=urllib.request.Request(url=image_src,headers=headers)
    # response=urllib.request.urlopen(request)
    # with open(filepath,'wb')as fp:
    #     fp.write(response.read())
    urllib.request.urlretrieve(image_src,filepath)
def main():
    url='http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'
    start_page=int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page,end_page+1):
        request=handle_request(url,page)
        content=urllib.request.urlopen(request).read().decode()
        parse_content(content)
        time.sleep(2)
if __name__ == '__main__':
    main()
pip install lxml
pip install jsonpath
json.dumps() : converts a dict or list into a JSON-formatted string
json.loads() : converts a JSON-formatted string into a Python object
json.dump()  : converts a dict or list into a JSON string and writes it to a file
json.load()  : reads a JSON string from a file and converts it into a Python object
Example:
import json
lt=[
{'name':'王宝强','age':30},
{'name':'贾乃亮','age':36},
{'name':'马蓉蓉','age':33},
{'name':'宋吉吉','age':40},
{'name':'李小璐','age':43}
]
json.dump(lt,open('json.txt','w',encoding='utf-8'))
obj=json.load(open('json.txt','r',encoding='utf-8'))
print(obj)
On the front end, a JSON string is turned into a JS object with:
JSON.parse('json string')
eval('(' + jsonString + ')')
Selenium: a browser automation / testing framework
from selenium import webdriver
import time
#Create a browser object, then drive the browser through it
browser=webdriver.Chrome()
url='http://www.baidu.com'
browser.get(url)
time.sleep(1)
#Find the input box; find_element returns a single element
my_input=browser.find_element_by_id('kw')
#Type text into the box
my_input.send_keys('美女')
time.sleep(1)
#Find the search button; find_elements returns a list of elements
button=browser.find_elements_by_class_name('s_btn')[0]
button.click()
time.sleep(1)
#Find a specific image and click it
image=browser.find_elements_by_class_name('op-img-address-link-imgs')[4]
image.click()
#Take a screenshot
size=browser.get_screenshot_as_png()
with open('text.png','wb')as fp:
    fp.write(size)
time.sleep(1)
#Close and quit the browser
browser.quit()
selenium + Chrome: the fallback that works when everything else fails
Headless (no-GUI) Chrome example:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options=Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser=webdriver.Chrome(chrome_options=chrome_options)
url='http://www.baidu.com'
browser.get(url)
time.sleep(3)
browser.save_screenshot('baidu.png')
browser.quit()
requests:https://2.python-requests.org/en/master/
r=requests.get(url,headers=headers,params=data)
r.text         the response body as a string
r.content      the response body as bytes
r.encoding     the response encoding
r.status_code  the status code
r.headers      the response headers
r.url          the URL that was requested
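A quick sketch of a GET request with params and headers (the query values are only for illustration):
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
data = {'wd': '美女', 'ie': 'utf-8'}
r = requests.get('http://www.baidu.com/s', headers=headers, params=data)
print(r.url)          # final URL with the params encoded into the query string
print(r.status_code)  # e.g. 200
r.encoding = 'utf-8'  # override the guessed encoding before reading r.text
print(r.text[:200])   # the first part of the body as a string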
Proxies:
import requests
proxies = {
'http': 'http://10.10.1.10:3128',
'https': 'http://10.10.1.10:1080',
}
requests.get('http://example.org', proxies=proxies)
Cookies: whenever a task is session-related (e.g. requires a login), create a session first:
s=requests.Session()
From then on, send every request through s, using s's methods, so the cookies are carried along (see the sketch below).
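A quick sketch of a Session carrying cookies across requests (the login URL and form fields are hypothetical placeholders):
import requests

s = requests.Session()
# hypothetical login endpoint and form fields - replace with the real ones from a capture
s.post('http://example.com/login', data={'user': 'xxx', 'pwd': 'xxx'})
# the Session reuses the cookies set by the login response on every later request
r = s.get('http://example.com/profile')
print(r.status_code)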
Example: downloading videos from Jinri Toutiao (今日头条)
# import requests
# url='http://v3-tt.ixigua.com/febad87225b72e64385177e3687d8b98/5d551f4b/video/m/2202559d2a2e7c242fb9f088c1dcd27e1421162e73d30000a49ff7e52996/?rc=M3FvbXZ2O3VsbjMzNjczM0ApdSk2OzQzNTczNDg3ODg7PDNAKTtlZDU6ZTw8NzdnZ2k4aDNnKXUpQGczdSlAZjN1KTk0ZC5tb3NlLjIvbF8tLTUtL3NzOmkxNTAwNS0uLS0yMi4uLS4vaWI1YjZhYy42YS4tYzA0Nl46YzpiMHAjOmEtcCM6YDU0Og%3D%3D'
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
# r=requests.get(url=url,headers=headers)
# with open('1.mp4','wb')as fp:
# fp.write(r.content)
import os
import requests
from lxml import etree
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
def handle_href(a_href,title):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get(a_href)
    time.sleep(3)
    tree=etree.HTML(browser.page_source)
    video_src=tree.xpath('//div[@id="vs"]/video/@src')[0]
    dr='shipin'
    if not os.path.exists(dr):
        os.mkdir(dr)
    filepath = 'shipin/' + title + '.mp4'
    print('%s开始下载'%title)
    r=requests.get(video_src)
    with open(filepath,'wb')as fp:
        fp.write(r.content)
    print('%s结束下载' % title)
def handle_title(widen):
    url='http://www.365yg.com/api/pc/feed/?min_behot_time=0&category=video_new&utm_source=toutiao&widen={}&tadrequire=true&as=A1458DA525D163D&cp=5D55D156B32DCE1&_signature=1ZvtWxAUiNfc15dOysAzMNWb7U'
    url=url.format(widen)
    r=requests.get(url=url,headers=headers)
    data=r.json()['data']
    for video_data in data:
        title=video_data['title']
        a_href='http://www.365yg.com'+video_data['source_url']
        handle_href(a_href,title)
def main():
    for widen in range(1,2):
        handle_title(widen)
if __name__ == '__main__':
    main()
Creating threads with Thread
Procedural style: t=threading.Thread(target=xxx,name=xxx,args=(xx,xx))
target: the function the thread executes after it starts
name: the thread's name
Get the current thread's name with threading.current_thread().name
args: arguments passed from the main thread to the child thread
t.start(): start the thread
t.join(): make the main thread wait until the child thread finishes
Object-oriented style: define a class that inherits from threading.Thread and override its run() method; if you need a thread name or extra arguments, override __init__ as well, and remember to call the parent class's constructor manually (see the sketch below).
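A quick sketch of both styles (the function names and arguments are only for illustration):
import threading

def worker(n):
    # args from the main thread arrive here as positional parameters
    print(threading.current_thread().name, 'got', n)

class MyThread(threading.Thread):
    def __init__(self, n, name=None):
        super().__init__(name=name)  # remember to call the parent constructor
        self.n = n
    def run(self):
        print(self.name, 'got', self.n)

t1 = threading.Thread(target=worker, name='worker-1', args=(1,))
t2 = MyThread(2, name='worker-2')
t1.start(); t2.start()
t1.join(); t2.join()  # the main thread waits for both children to finish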
Thread synchronization: threads share global variables,
which easily corrupts the data; use a thread lock, and whichever thread grabs it first runs first (see the sketch after these lines).
Create a lock:
suo=threading.Lock()
Acquire the lock:
suo.acquire()
Release the lock:
suo.release()
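A quick sketch of a Lock protecting a shared counter (the counter itself is only for illustration):
import threading

count = 0
lock = threading.Lock()

def add():
    global count
    for _ in range(100000):
        lock.acquire()   # only one thread may touch the shared variable at a time
        count += 1
        lock.release()

threads = [threading.Thread(target=add) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(count)  # 200000 every time; without the lock the result can come out smaller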