The company launched a new project over the National Day holiday, so there was no break at all, and then the crawler class started on October 9; there is a huge pile of things to do. The opening ceremony and the first lesson were fairly light, with no code at all. The first lesson mainly covered the concept of ports, communication protocols, splitting and assembling packets, HTTPS, HTTP requests and responses, an example HTTP request, the advantages of crawlers, and the types of crawlers.
Now the coding starts: GET and POST, the parts of a URL (especially the hexadecimal percent-encoding of Chinese characters), User-Agent, crawlers versus anti-crawler mechanisms, Referer, status codes, and packet-capture tools. The key points are below: urllib.request with urlopen, Request() and read(), and urllib.parse with urlencode and decoding.
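As a quick reminder of how the Chinese characters end up as hexadecimal in the URL, a minimal example (the keyword is 海贼王, the same one as in the commented URL further down):

import urllib.parse

print(urllib.parse.quote("海贼王"))                         # %E6%B5%B7%E8%B4%BC%E7%8E%8B
print(urllib.parse.urlencode({"wd": "海贼王"}))              # wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B
print(urllib.parse.unquote("%E6%B5%B7%E8%B4%BC%E7%8E%8B"))  # 海贼王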
# Approach 1.1: build the query string with urlencode
import urllib.request
import urllib.parse
# url = "https://www.baidu.com/s?wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B"
# search for a keyword and save the result page as a local html file
baseurl = "https://www.baidu.com/s?"
key = input("请输入要搜索的内容:")
# encode the query parameter
w = {"wd": key}
k = urllib.parse.urlencode(w)
# build the full url
url = baseurl + k
# print(url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'cookie': 'BAIDUID=BD7E1E18524FFC27F134FC0750F2A3B8:FG=1; BIDUPSID=BD7E1E18524FFC27F134FC0750F2A3B8; PSTM=1588334179; BDUSS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDUSS_BFESS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; PSINO=3; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; H_PS_PSSID=32814_32617_1443_32788_7544_32705_32230_7517_32116_32719_22159'
}
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")
with open("search.html", "w", encoding="utf-8") as f:
    f.write(html)
# Approach 1.2: encode the keyword with quote and append it to wd=
baseurl = 'https://www.baidu.com/s?wd='
key = input('请输入你要搜索的内容:')
k = urllib.parse.quote(key)
url = baseurl + k
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': 'BIDUPSID=23F0C104655E78ACD11DB1E20FA56630; PSTM=1592045183; BD_UPN=12314753; sug=0; sugstore=0; ORIGIN=0; bdime=0; BAIDUID=23F0C104655E78AC9F0FB18960BCA3D3:SL=0:NR=10:FG=1; BDUSS=ldxR1FyQ2FEaVZ5UWFjTDlRbThVZHJUQTY1S09PSU81SXlHaUpubVpEY0FMakZmRVFBQUFBJCQAAAAAAAAAAAEAAADzvSajSjdnaGgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChCV8AoQlfb; BDUSS_BFESS=ldxR1FyQ2FEaVZ5UWFjTDlRbThVZHJUQTY1S09PSU81SXlHaUpubVpEY0FMakZmRVFBQUFBJCQAAAAAAAAAAAEAAADzvSajSjdnaGgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChCV8AoQlfb; MCITY=-158%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=6; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; COOKIE_SESSION=204_0_5_9_4_6_0_0_5_4_0_0_533_0_0_0_1602246393_0_1602250500%7C9%2369429_193_1601361993%7C9; H_PS_PSSID=32757_32617_1428_7566_7544_31660_32723_32230_7517_32116_32718; H_PS_645EC=ab4cD3QpA7yZJBKDrrzZqesHzhDrwV%2BYww0WVHtmGJ3Adcj0qvjZIVV%2F9q4'
}
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
with open('搜索3.html', 'w', encoding='utf-8') as f:
    f.write(html)
This is the second crawler lesson with code, the third lesson overall.
Time flies; counting the opening ceremony, there have already been four crawler sessions.
This lesson mainly covered the urllib.parse module, its common methods, and the GET and POST request styles.
We then wrote the same crawler three ways: as plain code, with functions, and as a class. Finally the lesson introduced the easier-to-use requests module, its response methods, setting a proxy with requests, and SSL. Crawler code is far longer than data-analysis code; one post probably won't fit all the notes.
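Setting a proxy and handling SSL with requests are only mentioned here and never show up in the code below, so here is a minimal sketch; the proxy address is a made-up placeholder.

import requests

# placeholder proxy address: swap in a real proxy before using this
proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
headers = {'User-Agent': 'Mozilla/5.0'}
# verify=False skips SSL certificate verification, for sites with broken certificates
res = requests.get("https://www.baidu.com/", headers=headers,
                   proxies=proxies, verify=False, timeout=5)
print(res.status_code)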
import urllib.request
import urllib.parse
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
# for POST, the query parameters and the submitted data are hidden in the form and do not show up in the url
# and they must be submitted as bytes, not str
name = input("请输入贴吧的名字:")
begin = int(input("请输入起始页:"))
end = int(input("请输入结束页:"))
# re-assign: url-encode the kw dict
kw = {"kw": name}
kw = urllib.parse.urlencode(kw)
# build the url, send the request, get the response
for i in range(begin, end + 1):
    pn = (i - 1) * 50
    # print(pn)   "https://tieba.baidu.com/f?kw=%???&pn=0"
    baseurl = "https://tieba.baidu.com/f?"
    url = baseurl + kw + "&pn=" + str(pn)
    # print(url)
    # send the request
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # write the page to a file
    filename = "第" + str(i) + "页.html"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
import urllib.request
import urllib.parse
def readpage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    return html
def writepage(filename, html):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
def main():
    name = input("请输入贴吧的名字:")
    begin = int(input("请输入起始页:"))
    end = int(input("请输入结束页:"))
    # re-assign: url-encode the kw dict
    kw = {"kw": name}
    kw = urllib.parse.urlencode(kw)
    for i in range(begin, end + 1):
        pn = (i - 1) * 50
        # print(pn)   "https://tieba.baidu.com/f?kw=%???&pn=0"
        baseurl = "https://tieba.baidu.com/f?"
        url = baseurl + kw + "&pn=" + str(pn)
        # call the helper functions
        html = readpage(url)
        filename = "第" + str(i) + "页.html"
        writepage(filename, html)
if __name__ == "__main__":
    main()
import urllib.request
import urllib.parse
class BaiduSpider():
    # put the parts that never change into __init__
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.baseurl = "https://tieba.baidu.com/f?"
    def readpage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html
    def writepage(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
            print("write successfully")
    def main(self):
        name = input("请输入贴吧的名字:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        # re-assign: url-encode the kw dict
        kw = {"kw": name}
        kw = urllib.parse.urlencode(kw)
        for i in range(begin, end + 1):
            pn = (i - 1) * 50
            # print(pn)   "https://tieba.baidu.com/f?kw=%???&pn=0"
            # baseurl = "https://tieba.baidu.com/f?"
            url = self.baseurl + kw + "&pn=" + str(pn)
            # call the methods
            html = self.readpage(url)
            filename = "第" + str(i) + "页.html"
            self.writepage(filename, html)
if __name__ == "__main__":
    # to call main() we first need an instance of the class
    spider = BaiduSpider()
    spider.main()
import urllib.request
import urllib.parse
import json
key = input("请输入你要翻译的内容:")
# the data taken from the form
data = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15880623642174',
    'sign': 'c6c2e897040e6cbde00cd04589e71d4e',
    'ts': '1588062364217',
    'bv': '42160534cfa82a6884077598362bbc9d',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION'
}
data = urllib.parse.urlencode(data)  # url-encode the form data
data = bytes(data, "utf-8")          # POST data must be bytes
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
req = urllib.request.Request(url, data=data, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")
# print(html)
# turn the json string into a python dict
r_dict = json.loads(html)
# print(type(r_dict), r_dict)
r = r_dict['translateResult'][0][0]['tgt']
print(r)
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
# send the request
response = requests.get('https://qq.yh31.com/zjbq/2920180.html', headers=headers)
# print the response object
# print(response)
# print(response.text)     # returns str data (text is a property, decoded with the guessed encoding)
# print(response.content)  # returns the raw bytes
# fix the mojibake, way 1: the most fundamental approach
# print(response.content.decode('utf-8'))
# way 2: set the encoding before reading .text
response.encoding = 'utf-8'
print(response.text)
import requests
import json
key = input("请输入你要翻译的内容:")
data = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15880623642174',
    'sign': 'c6c2e897040e6cbde00cd04589e71d4e',
    'ts': '1588062364217',
    'bv': '42160534cfa82a6884077598362bbc9d',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION'
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
res = requests.post(url, data=data, headers=headers)
res.encoding = 'utf-8'
html = res.text
print(html)
For the fourth lesson I happened to be out of town and missed the live class completely; I only watched the recording today. It mainly covered cookies, which identify a user by storing information on the client side, and sessions, which keep a conversation alive. The example here was 12306, which is really hard because the links and captcha images need extra processing; since it is rarely needed I did not practice the source code.
After that came regular expressions, whose use cases are crawling and form validation: the meaning of the match() parameters and the metacharacters.
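The cookie/session demo itself didn't make it into these notes, so here is a minimal sketch of keeping a session with requests.Session; the login URL and form fields are placeholders, not any real site's API.

import requests

session = requests.Session()  # cookies sent back by the server are stored and reused automatically
headers = {'User-Agent': 'Mozilla/5.0'}

# placeholder login endpoint and form fields
login_url = "https://example.com/login"
session.post(login_url, data={"username": "demo", "password": "demo"}, headers=headers)

# later requests in the same Session carry the cookies from the login response
res = session.get("https://example.com/profile", headers=headers)
print(res.status_code)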
import requests
url = "https://www.zhihu.com/hot"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': '_zap=2b2da192-4a19-494c-ad1f-5958d7e7c2ed; d_c0="AJCcbXXNNBGPTkkYDJqqrnc1aAb2_C-bCsw=|1588381296"; _ga=GA1.2.1921282150.1588381305; _xsrf=Bzdm8qxmJxdew35etcYJLyGZ4bnXKbTk; capsion_ticket="2|1:0|10:1602914462|14:capsion_ticket|44:YmQzZWY3OWUzMWZkNDZhMTljOGU1Mzg1NmMxMDc2ZjQ=|74f0c75f6fe930b0c1b846c6f10fe2e40e3cfdf6ea208e1bee57b37c2f20959b"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1601782524,1601782524,1601792284,1602914461; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1602914461; SESSIONID=PExwgztMg8NTHEIPaHNXFdNcmFUpNlmyRzAqHhR4Tuh; KLBRSID=53650870f91603bc3193342a80cf198c|1602914463|1602914461; JOID=VV4QBE7Zoo5GnXawfdlxkT-vC81uiOT0M_gNxzK8m8MQ1zH4LkQrBBmedLF9WUyPnHvzeZ8hPSWlMF-7pG5tJEY=; osd=Ul0dBE3eoYNGnnGzcNlyljyiC85pi-n0MP8OyjK_nMAd1zL_LUkrBx6debF-Xk-CnHj0epIhPiKmPV-4o21gJEU='
}
res = requests.get(url, headers=headers)
print(res.text)
import requests
def query():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Cookie': '_uab_collina=159411739915355684359315; JSESSIONID=2CA548ACBAE3C9D5A89DEC11B114092C; tk=ozQ_C6DmGnfgkG6vVrIbf14VPmvctqELdZ3A_Q36w1w0; _jc_save_wfdc_flag=dc; BIGipServerotn=1307574794.50210.0000; BIGipServerpool_passport=283378186.50215.0000; RAIL_EXPIRATION=1603212845478; RAIL_DEVICEID=pwI3nLwCVl1tmaCqFY91TAPCGiPP1DvQ4ZZuyh9EoopLY3yIsXiOGZZi-JsbiazprFnKYmYXbLq8fzIxltxY-G7qJC8xekxUVfTWMfwVOS2LYogIbdwsZhZnJVHBOO7_GqRaxm5Ht9PLTgXkMYZACGTyxitObYXu; route=c5c62a339e7744272a54643b3be5bf64; _jc_save_toDate=2020-10-17; _jc_save_toStation=%u8D35%u9633%2CGIW; _jc_save_fromDate=2020-10-30; uKey=2bdff5e6696c3bf98d33828e2ccd10506890ad368d3f4da37deba18de81c3717; current_captcha_type=Z; _jc_save_fromStation=%u5408%u80A5%2CHFH'
    }
    response = requests.get("https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2020-10-30&leftTicketDTO.from_station=HFH&leftTicketDTO.to_station=GIW&purpose_codes=ADULT", headers=headers)
    print(response.content.decode("utf-8"))
query()
import re
# match(pattern, string, flags=0): the first argument is the regular expression; on success
# it returns a match object, on failure it returns None. The second argument is the string
# to match against. The third, flags=0, controls how the pattern is applied (case
# sensitivity, multi-line matching, and so on).
pattern = 'python'
s = 'python and java'
result = re.match(pattern, s)
if result:
    # print(result)
    print(result.group())
    print(result.start())
    print(result.end())
    print(result.span())
else:
    print("no data was found")
# . (the dot) matches any single character except a newline (\n)
re.match(r'a.c', 'avc')
re.match(r'a.c', 'avc').group()
re.match(r'a.c', 'a你c').group()
# re.match(r'a.c', 'a你好c').group()   # fails: . matches exactly one character
re.match(r'a|c', 'a').group()
re.match(r'a|c', 'c').group()
re.match(r'a|c', 'ac').group()
# re.match(r'a|b', 'cba').group()   # fails, because match only looks at the start of the string
re.search(r'a|b', 'cba').group()
# [] matches any one character from the set
re.match(r'[abc]', 'b').group()
# re.match(r'[abc]2', 'a').group()
re.match(r'速度与激情[12345678]', '速度与激情12').group()
# [^...] negates the character set
re.match(r'速度与激情[^12345678]', '速度与激情0').group()
re.match(r'速度与激情[^12345678]', '速度与激情01').group()
re.match(r'速度与激情[1-8]', '速度与激情7').group()
re.match(r'速度与激情[a-z]', '速度与激情w').group()
# re.match(r'速度与激情[a-z]', '速度与激情W').group()
# \ escapes the character that follows it
re.match(r'速度.与激情[a-z]', '速度.与激情w').group()
re.match(r'速度\.与激情[a-z]', '速度.与激情w').group()  # note the backslash here
October 18, 2020. It is the weekend, and I finally have time to catch up on Friday's lesson.
The second regular-expression lesson. It is all small knowledge points and the content is fairly tedious; the teaching used Jupyter plus PyCharm. It covered regex as a kind of logical filter applied to strings, used for crawling and form validation: ordinary characters, metacharacters, the predefined character classes \w, \d and \s, repetition, non-greedy matching, the common re module methods (match, search, compile, findall, split, sub), grouping, and expressions for a few special scenarios.
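Non-greedy matching is listed here but not really exercised below, so a minimal example of the difference:

import re

s = '<div>aaa</div><div>bbb</div>'
print(re.findall(r'<div>.*</div>', s))   # greedy: ['<div>aaa</div><div>bbb</div>']
print(re.findall(r'<div>.*?</div>', s))  # non-greedy: ['<div>aaa</div>', '<div>bbb</div>']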
# \d matches any single digit 0-9
re.match(r'123', '123').group()
re.match(r'[1]23', '123').group()
re.match(r'[123]', '123').group()
re.match(r'\d', '123').group()
# \w matches any single character in 0-9, A-Z, a-z, _ or a Chinese character
re.match(r'\w', 'a123').group()
re.match(r'\w', '0123').group()
re.match(r'\w', '_0123').group()
re.match(r'\w', '你0123').group()
# \s matches any single whitespace character: space, tab, form feed and so on
re.match(r'\s', ' ').group()
re.match(r'\s', '\t').group()
re.match(r'速度与激情\d', '速度与激情7').group()
re.match(r'速度与激情\w', '速度与激情7').group()
re.match(r'速度与激情\w', '速度与激情a').group()
re.match(r'速度与激情\s', '速度与激情 1').group()
# \D is the complement of \d: any single non-digit character
re.match(r'速度与激情\D', '速度与激情啊').group()
# \W is the complement of \w
re.match(r'速度与激情\W', '速度与激情$').group()
re.match(r'\d{3}', '999').group()
re.match(r'\d{11}', '18519018835').group()
re.match(r'\d{3,4}-\d{7,8}', '0123-1234567').group()
re.match(r'\d{3,4}-\d{7,8}', '012-1234567').group()
re.match(r'\d{3,4}-\d{7,8}', '012-12345678').group()
# {m,} means repeat at least m times
re.match(r'\d{3,}-\d{7,8}', '01234-12345678').group()
re.match(r'w[a-z]*', 'wedco').group()   # * : the previous character zero or more times
re.match(r'w[a-z]?', 'wedco').group()
re.match(r'w[a-z]?', 'w').group()
re.match(r'w[a-z]+', 'wedco').group()
re.match(r'w[a-z]+', 'we').group()
# re.match(r'w[a-z]+', 'w').group()
html_content = ''' fasgsgggga
fasfgsdgs
fjtgitykiyuuyl
gdhdhjht
gfsdgsdg
'''
re.match(r'.*', html_content).group()         # without re.S the dot stops at the newline
re.match(r'.*', html_content, re.S).group()   # with re.S the dot matches newlines too
import re
# match the data: the first argument is the regex pattern, the second is the data (a list)
def fn(ptn, lst):
    for x in lst:
        result = re.match(ptn, x)
        if result:
            print(x, "it matches", 'the results are:', result.group())
        else:
            print(x, 'match fails')
# lst = ["py2", "py3", "other", "pyxxx", "nba"]
# ptn = "py\d"
# ptn = "py\D"
# lst = ["hello world", "hellodajia", "hello,world", "pyxxx", "nba"]
# \s matches whitespace
# ptn = 'hello\sworld'
# \S matches non-whitespace
# ptn = 'hello\Sworld'
# \w matching
# lst = ["1-age", "a-age", "#-age-", "_-age", "美-age"]
# ptn = '\w-age'
# \W matches non-word characters
# ptn = '\W-age'
# * : the previous character repeated zero or more times
# + : the previous character repeated one or more times
# {m,n} : the previous character repeated m to n times
# {m,}
# placeholder e-mail addresses: the original sample list was not preserved in the notes
lst = ["123456@qq.com", "abc_123@qq.com", "xxx@163.com", "_xxx123", "12345678"]
# ptn = "[\w]+@qq.com"
# ptn = "\w+@qq.com"     # works without the square brackets too
# ptn = "\w+@qq.com$"    # the match has to end right at qq.com
ptn = "\w+@qq.com?"
fn(ptn, lst)
import re
text = "apple price is $99,orange price is $88"
# goal: pull out $99 and $88;  .+ matches any run of one or more characters
# result = re.search('.+\$\d+.+\$\d+', text)   # matches the whole string, no groups
result = re.search('.+(\$\d+).+(\$\d+)', text)
# print(result.group())
print(result.groups())   # get all the captured groups
pat = re.compile(r'abc')
pat.match('abc123')
pat.match('abc123').group()
pat = re.compile(r'abc')
pat.match('ABC123')
# pat.match('ABC123').group()   # no match: the case differs
pat = re.compile(r'abc', re.I)
pat.match('ABC123').group()     # matches now, re.I ignores case
# search looks for the pattern anywhere in the text
re.search(r'abc', '123abcfmaingjshkaiuhabc900').group()
# findall returns a list of every match; the result has no group attribute
re.findall(r'abc', '123abcfmaingjshkaiuhabc900')
re.findall(r'Abc', '123abcfmaingjshkaiuhabc900')   # returns an empty list
s = "8+7*5+6/3"
re.findall(r'\d{1,}', s)
re.findall(r'\d+', s)
re.split(r'[+*/]', s)
re.split(r'[+*/]', s, maxsplit=2)
s = 'i am jerry i am very handsome! i like you'
r = re.sub(r'i', 'I', s)   # replace lowercase i with uppercase I
r
The October 20 lesson had relatively little code: practice with regular expressions plus some uses of XPath (the XML path language).
import requests
import re
url = "https://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1603249247805_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1603249247806%5E00_2543X1297&sid=&word=%E7%BE%8E%E5%A5%B3"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': 'BAIDUID=BD7E1E18524FFC27F134FC0750F2A3B8:FG=1; BIDUPSID=BD7E1E18524FFC27F134FC0750F2A3B8; PSTM=1588334179; BDUSS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDUSS_BFESS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=3; H_PS_PSSID=32814_1443_32872_32705_32230_7517_32116_22159; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=2k2la4al8h21alavai1fov8db0l; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=www.baidu.com; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm'
}
r = requests.get(url, headers=headers)
ret = r.text
# print(ret)
result = re.findall('"objURL":"(.*?)"', ret)
# print(result)
for i in result:
    # use the last 10 characters of the image url as its name
    name = i[-10:]
    # clean the name: some of them contain / and cannot be written into the img folder
    name = re.sub("/", "", name)
    print(name)
    # fix the file extension
    end = re.search(r'(.jpg|.png|.jpeg|.gif)$', name)   # the r prefix goes outside the ()
    if end == None:
        name = name + '.jpg'
    with open('img/' + name, 'wb') as f:   # save into the img folder under the current directory
        # handle network problems with an exception
        try:
            r = requests.get(i)
        except Exception as e:
            print(e)
            continue   # skip this image if the download failed
        f.write(r.content)
# XML Path Language: navigate a document by its elements and attributes.
# [] is a predicate, used to find a particular node or a node that contains a particular value;
# predicates are always embedded in [].
from lxml import etree
# sample list markup: the original HTML block was not preserved in the notes, so this is a stand-in
wb_data = """
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
"""
# turn wb_data into an Element object
html_element = etree.HTML(wb_data)
# get the href of every a tag under a li tag
links = html_element.xpath("//li/a/@href")
print(links)
# get the text of those a tags
result = html_element.xpath('//li/a/text()')
print(result)
# combine the two lists into dicts, pairing the items one to one
for link in links:
    d = {}
    d['href'] = link
    # get the index of the link
    # print(links.index(link))
    d['title'] = result[links.index(link)]
    print(d)
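The predicate syntax ([]) described above is not used in the snippet, so here is a minimal sketch against some made-up list markup:

from lxml import etree

html = etree.HTML('<ul><li><a href="a.html">A</a></li><li><a href="b.html">B</a></li></ul>')
print(html.xpath('//li[1]/a/text()'))             # first li  -> ['A']
print(html.xpath('//li[last()]/a/text()'))        # last li   -> ['B']
print(html.xpath('//a[@href="b.html"]/text()'))   # attribute predicate -> ['B']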
The October 22 lesson covered reading and writing CSV files (and the newline handling that goes with it).
It then introduced bs4: a look at its source code, how to use it, the Tag, NavigableString and Comment objects, and traversing the document tree and its child nodes.
# reading and writing csv files
import csv
titles = ('name', 'age', 'height')
persons = [('张三', 20, 175), ('李四', 22, 178), ('王五', 30, 180), ]
with open("person.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i in reader:
        print(i["name"])
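The titles and persons defined above suggest the lesson also wrote the file before reading it back; a minimal sketch of what that write step might have looked like (same person.csv filename):

import csv

titles = ('name', 'age', 'height')
persons = [('张三', 20, 175), ('李四', 22, 178), ('王五', 30, 180)]
# write the header row and the data rows so the DictReader above has something to read
with open("person.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(titles)
    writer.writerows(persons)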
# //div[@class="info"]/div[@class='hd']/a/span[@class='title']/text()
# page 2: https://movie.douban.com/top250?start=25&filter=
# page 3: https://movie.douban.com/top250?start=50&filter=
# pattern: start = (page - 1) * 25
import requests
from lxml import etree
import csv
# base url
doubanurl = "https://movie.douban.com/top250?start={}&filter="
# fetch the page source
def getSource(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Cookie': 'DSID=AAO-7r5nXMym6lqYC0CQuKHfBMYUuFyKgO5zR8YXBDICiw_TMfaKmRajdoYbdQ2z15T2UoQ3R4fu_NLNptnImDvDKE5Gg3rxw1y7Kcp3XmRIriGCQdbMLjU; id=2289b75107c40002||t=1602377859|et=730|cs=002213fd489ca8a8699c62b057'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text
# parse out the movie title, rating, quote and detail-page link
def getEveryItem(source):
    html_element = etree.HTML(source)
    movieItemList = html_element.xpath('//div[@class="info"]')
    # list of movie dicts to return
    movieList = []
    # collect the details of each movie
    for eachMovie in movieItemList:
        movieDict = {}
        title = eachMovie.xpath("div[@class='hd']/a/span[@class='title']/text()")
        otherTitle = eachMovie.xpath("div[@class='hd']/a/span[@class='other']/text()")
        link = eachMovie.xpath('div[@class="hd"]/a/@href')[0]
        star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]   # [0] takes the value out of the list
        quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')
        # the quote is optional: take it if it exists, otherwise leave it empty
        if quote:
            quote = quote[0]
        else:
            quote = ''
        # assemble the movie record
        movieDict['title'] = "".join(title + otherTitle)
        movieDict['url'] = link
        movieDict['star'] = star
        movieDict['quote'] = quote
        movieList.append(movieDict)
    print(movieList)
    return movieList
def writeData(movieList):
    with open("douban.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()
        for each in movieList:
            writer.writerow(each)
# run the crawler
if __name__ == '__main__':
    movieList = []
    for i in range(10):
        # build the url of each page
        pageLink = doubanurl.format(i * 25)
        # fetch the source of that page
        source = getSource(pageLink)
        # parse it: movieList = movieList + getEveryItem(source)
        movieList += getEveryItem(source)
    # write out the data
    writeData(movieList)
# import bs4
# Tag: a tag object
# Comment: a comment object
from bs4 import BeautifulSoup
# the standard "three sisters" sample document from the bs4 docs
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, features="lxml")
# print(soup)
# print(soup.prettify())   # pretty-printed output
soup = BeautifulSoup(html_doc, 'lxml')
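Tag, NavigableString and Comment are mentioned in the notes but never shown, so here is a minimal sketch using the soup built above:

from bs4 import Comment

tag = soup.title                       # a Tag object
print(tag.name, tag.attrs)             # 'title' {}
print(type(tag.string), tag.string)    # NavigableString: The Dormouse's story
# a Comment is a special NavigableString that wraps <!-- ... --> nodes
c_soup = BeautifulSoup("<b><!--this is a comment--></b>", "lxml")
print(isinstance(c_soup.b.string, Comment))   # True
print(c_soup.b.string)                        # this is a comment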
# traversing the document tree: contents returns a list of all child nodes, children returns an iterator over them,
# descendants returns a generator that walks through every descendant
from bs4 import BeautifulSoup
# reuse the same "three sisters" html_doc defined above
soup = BeautifulSoup(html_doc, 'lxml')
for x in soup.descendants:
    print('----------')
    print(x)
Saturday's lesson. The code is all fairly simple, mostly practice with find, find_all and select. BS4 really is convenient.
from bs4 import BeautifulSoup
# reuse the same "three sisters" html_doc as above
soup = BeautifulSoup(html_doc, "lxml")
# stripped_strings is basically the same as strings, but it drops the extra whitespace
# parent gets the direct parent node
# parents gets all of the ancestor nodes
# print(soup.html.parent)
# next_siblings: all following sibling nodes; previous_siblings: all preceding sibling nodes
html = '<b>bbb</b><c>ccc</c>'   # tags reconstructed here; the original markup was stripped when the notes were exported
soup2 = BeautifulSoup(html, 'lxml')
b_tag = soup2.b
a_tag = soup.find(id='link3')
# print(a_tag)
for x in a_tag.previous_siblings:
    print(x)
from bs4 import BeautifulSoup
# reuse the same "three sisters" html_doc as above
soup = BeautifulSoup(html_doc, "lxml")
# a string filter: the 'a' here stands for the tag name itself
# a_tag = soup.find('a')   # finds the first <a> (the first matching piece of data)
# print(a_tag)
# a list filter: look for several tags at once
print(soup.find_all(['title', 'b']))
'''
find_all() returns all of the matching tags as a list
find() returns only the first match
'''
from bs4 import BeautifulSoup
# recruitment-table sample; the <tr>/<td>/<a> markup is reconstructed and the href values
# are placeholders, because the original tags were stripped when the notes were exported
html = """
<table class="tablelist">
<tr class="h"><td>职位名称</td><td>职位类别</td><td>人数</td><td>地点</td><td>发布时间</td></tr>
<tr class="even"><td><a href="#">22989-金融云区块链高级研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="odd"><td><a href="#">22989-金融云高级后台开发</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="even"><td><a href="#">SNG16-腾讯音乐运营开发工程师(深圳)</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="odd"><td><a href="#">SNG16-腾讯音乐业务运维工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="even"><td><a href="#">TEG03-高级研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td><a href="#">TEG03-高级图像算法研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="even"><td><a href="#">TEG11-高级AI开发工程师(深圳)</a></td><td>技术类</td><td>4</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td><a href="#">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="even"><td><a href="#">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td><a href="#">SNG11-高级业务运维工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
# 1 get the tr tags
# 2 get the second tr tag
# tr = soup.find_all('tr', limit=2)
# tr = soup.find_all('tr', limit=2)[1]
# 3 get all tr tags whose class is "even"
# 5 get the href attribute of every a tag
a = soup.find_all('a')
for i in a:
    href = i['href']
    # href = i.attrs['href']   # a second way to write it
    print(href)
# 6 get all of the job information (the text data)
trs = soup.find_all('tr')[1:]   # start from the second tr, skipping the header
for tr in trs:
    tds = tr.find_all('td')
    jobname = tds[0].string   # take the value via .string
    print(jobname)
from bs4 import BeautifulSoup
# reuse the same "three sisters" html_doc as above
soup = BeautifulSoup(html_doc, "lxml")
# 1 find by tag name
# print(soup.select('a'))
# 2 find by class name: '.sister' means class="sister"
# print(soup.select('.sister'))
# 3 find by id
# print(soup.select('#link1'))
# print(soup.select('p #link1'))
# print(soup.select('head>title'))
# print(soup.select('a[href="http://example.com/elsie"]'))   # single quotes outside, double quotes inside
# 6 get the text content
print(soup.select('title')[0].get_text())   # select returns a list, so index into it before calling a method
print(soup.select('title')[0].string)       # alternative
from bs4 import BeautifulSoup
# reuse the same recruitment-table html defined above
soup = BeautifulSoup(html, 'lxml')
# 1 get all the tr tags
# 2 get the second tr tag
# 3 get all tr tags whose class is "even"
# 4 get the href attribute of every a tag
# 5 get all of the job information (the text data)
trs = soup.select('tr')
for tr in trs:
    info = list(tr.stripped_strings)   # stripped_strings drops the extra whitespace;
    # tr.strings keeps the line breaks; both are generators
    print(info)
from bs4 import BeautifulSoup
# reuse the same "three sisters" html_doc as above
soup = BeautifulSoup(html_doc, "lxml")
# 1 first find the tag
# ------------
r = soup.find(class_='title')
# print(r)
r.decompose()   # remove the tag from the tree
print(soup)
The crawler code is getting far too long, so this is the last update to this post; the rest will continue in "Python crawler course notes, part 2". This lesson walked through one case study and then laid out the crawling workflow: 1. prepare the URLs (both when the pages are known up front and when the page count is not); 2. send requests to those URLs, adding a random User-Agent, cookies, and proxy IPs; 3. extract the data, deciding where it lives (is it in the current URL's response or not) and how to extract it: XPath, bs4,
or regular expressions; 4. save the data, for example to a database.
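A random User-Agent shows up in this checklist but nowhere in the code above, so a minimal sketch (the UA strings are just examples):

import random
import requests

# a small pool of example User-Agent strings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
]
headers = {'User-Agent': random.choice(USER_AGENTS)}   # pick a different UA for each request
res = requests.get('http://www.weather.com.cn/textFC/hb.shtml', headers=headers)
print(res.status_code)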
# http://www.weather.com.cn/textFC/hb.shtml   North China
# http://www.weather.com.cn/textFC/db.shtml   Northeast China
import requests
import csv
from bs4 import BeautifulSoup
# define the csv header
titles = ('city', 'temp')
# a function to parse one page
def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # print(response.text)
    text = response.content.decode('utf-8')   # fix the mojibake
    # soup = BeautifulSoup(text, 'lxml')
    # pip install html5lib
    soup = BeautifulSoup(text, 'html5lib')
    # parse the page
    # find the conMidtab div
    conMidtab = soup.find('div', class_='conMidtab')
    # find the table tags
    tables = conMidtab.find_all('table')
    # collect the data
    lst = []
    for table in tables:
        # print(table)
        # print('-'*50)
        # find all the tr tags and drop the first two (the header rows)
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            # print(tr)
            tds = tr.find_all('td')
            city_td = tds[0]
            if index == 0:
                city_td = tds[1]
            info = {}
            city = list(city_td.stripped_strings)[0]   # the city name
            tds = tr.find_all('td')
            temp_td = tds[-2]
            temp = list(temp_td.stripped_strings)[0]   # the temperature
            info['city'] = city
            info['temp'] = temp
            lst.append(info)
            print('city:', city, 'temp:', temp)
    return lst
    # break   # print just Beijing's data first
def writeData(lst):
    with open('citytemp.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, titles)
        writer.writeheader()
        writer.writerows(lst)
def main():
    lst = []
    # url = 'http://www.weather.com.cn/textFC/hb.shtml'
    # url = 'http://www.weather.com.cn/textFC/db.shtml'
    # url = 'http://www.weather.com.cn/textFC/gat.shtml'
    urls = ['http://www.weather.com.cn/textFC/hb.shtml', 'http://www.weather.com.cn/textFC/db.shtml', 'http://www.weather.com.cn/textFC/gat.shtml']
    for url in urls:
        lst += parse_page(url)
        # parse_page(url)
    writeData(lst)
if __name__ == '__main__':
    main()
# download link for the phantomjs tool
# Mirror index of http://chromedriver.storage.googleapis.com/86.0.4240.22/
# http://npm.taobao.org/mirrors/chromedriver
from selenium import webdriver
# load the driver
driver = webdriver.PhantomJS(executable_path=r'D:\Program Files\PyCharm Community Edition 2020.1.2\phantomjs.exe')
# PhantomJS is deprecated now, but plenty of old projects still use it
# open Baidu
driver.get('https://www.baidu.com/')
# locate the elements and type into them with send_keys
driver.find_element_by_id('kw').send_keys('python')   # the search box
driver.find_element_by_id('su').click()
# print(driver.page_source)   # view the page source
print(driver.current_url)     # view the current url
# take a screenshot
driver.save_screenshot('baidu.png')
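Since PhantomJS is deprecated, the same flow is usually done with headless Chrome now; a minimal sketch using the same element ids, with the chromedriver path as a placeholder:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')   # run Chrome without opening a window

# placeholder driver path: point it at your own chromedriver
driver = webdriver.Chrome(executable_path=r'D:\tools\chromedriver.exe', options=options)
driver.get('https://www.baidu.com/')
driver.find_element_by_id('kw').send_keys('python')
driver.find_element_by_id('su').click()
print(driver.current_url)
driver.save_screenshot('baidu_headless.png')
driver.quit()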