Python Web Scraping Course Notes

11 Oct 2020

The company launched a new project over the National Day holiday, so I got no rest at all, and then the web scraping class started on October 9 — a lot going on at once. The opening ceremony and the first class were fairly light; we didn't write any code. The first class mainly covered the concept of ports, communication protocols, packet packing and unpacking, HTTPS, HTTP requests and responses, a sample HTTP request, the advantages of crawlers, and how crawlers are classified.
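Since the first class was theory only, here is a minimal request/response round trip of my own (not course code; the host and header values are just placeholders) using only the standard library, to make the "HTTP request / response" idea concrete:

# A minimal sketch of one HTTP request/response exchange with the standard library.
import http.client

conn = http.client.HTTPSConnection("www.baidu.com", 443)   # port 443 = HTTPS
conn.request("GET", "/", headers={"User-Agent": "Mozilla/5.0"})
resp = conn.getresponse()

print(resp.status, resp.reason)    # status line, e.g. "200 OK"
print(resp.getheaders()[:5])       # a few of the response headers
body = resp.read()                 # raw bytes of the response body
print(len(body), "bytes received")
conn.close()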

Lesson 2

We started writing code. Topics: GET and POST, the parts of a URL (especially the percent-encoded hex form of Chinese characters), User-Agent, crawling and anti-crawling mechanisms, the Referer header, status codes, and packet-capture tools. The key points: urllib.request with urlopen(), Request() and read(), plus urllib.parse with urlencode(), and encode()/decode().
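The Referer header and status codes were only mentioned in passing, so here is a small sketch of my own (httpbin.org is my choice, not from the class) that sets both User-Agent and Referer and then reads the status code back:

# Sketch (assumes requests is installed): send a browser-like User-Agent and a Referer,
# then inspect the status code and the headers the server saw.
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",  # pretend to be a browser
    "Referer": "https://www.baidu.com/",                        # page we claim to come from
}
resp = requests.get("https://httpbin.org/get", headers=headers)
print(resp.status_code)                    # e.g. 200
print(resp.json()["headers"]["Referer"])   # httpbin echoes our request headers back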

# Method 1: download an image with requests

import requests

url = 'https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=2534506313,1688529724&fm=26&gp=0.jpg'

req = requests.get(url)

fn = open('code.png', 'wb')

fn.write(req.content)

fn.close()

#方式1.1

with open(“code2.jpg”,“wb”) as f:

f.write(req.content)

# Method 2: urllib.request.urlretrieve

from urllib import request

url = 'https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=2534506313,1688529724&fm=26&gp=0.jpg'

request.urlretrieve(url, "code3.png")

import urllib.request

# # Get the response object. urlopen() does not let you customise the User-Agent.
# response = urllib.request.urlopen("https://www.baidu.com")
# # read() reads the content of the response object.
# # decode() turns bytes into str; encode() turns str into bytes.
# html = response.read().decode("utf-8")
# print(type(html), html)

url = "https://www.baidu.com"

# It is recommended to add a cookie to the headers as well.
# headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36"}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

# Create the request object.
req = urllib.request.Request(url, headers=headers)

# Get the response object with urlopen().
res = urllib.request.urlopen(req)

html = res.read().decode("utf-8")
# print(html)

print(res.getcode())  # returns the status code
print(res.geturl())   # returns the URL that was actually requested

The percent-encoded (hex) form of 海贼王 is %E6%B5%B7%E8%B4%BC%E7%8E%8B — each Chinese character takes three %XX groups, one per UTF-8 byte.

# url = "https://www.baidu.com/s?wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B"
import urllib.parse
import urllib.request

te = {"wd": "海贼王"}

result = urllib.parse.urlencode(te)

print(result)  # prints the percent-encoded form of 海贼王
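As a quick check of the "three %XX groups per character" rule above (my own addition, not course code):

from urllib.parse import quote, unquote

encoded = quote("海贼王")
print(encoded)                          # %E6%B5%B7%E8%B4%BC%E7%8E%8B
print(len("海贼王".encode("utf-8")))     # 9 bytes -> 9 %XX groups, 3 per character
print(unquote(encoded))                 # decodes back to 海贼王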

# Search for something and save the result page to a local HTML file.
baseurl = "https://www.baidu.com/s?"

key = input("请输入要搜索的内容:")
# Percent-encode the query.
w = {"wd": key}
k = urllib.parse.urlencode(w)

# Build the full URL.
url = baseurl + k
# print(url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'cookie': 'BAIDUID=BD7E1E18524FFC27F134FC0750F2A3B8:FG=1; BIDUPSID=BD7E1E18524FFC27F134FC0750F2A3B8; PSTM=1588334179; BDUSS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDUSS_BFESS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; PSINO=3; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; H_PS_PSSID=32814_32617_1443_32788_7544_32705_32230_7517_32116_32719_22159'
}
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")

# Write it to a file.
with open("search.html", "w", encoding="utf-8") as f:
    f.write(html)

# The "wd=" + urllib.parse.quote() way
baseurl = 'https://www.baidu.com/s?wd='

key = input('请输入你要搜索的内容:')

k = urllib.parse.quote(key)

url = baseurl + k

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': 'BIDUPSID=23F0C104655E78ACD11DB1E20FA56630; PSTM=1592045183; BD_UPN=12314753; sug=0; sugstore=0; ORIGIN=0; bdime=0; BAIDUID=23F0C104655E78AC9F0FB18960BCA3D3:SL=0:NR=10:FG=1; BDUSS=ldxR1FyQ2FEaVZ5UWFjTDlRbThVZHJUQTY1S09PSU81SXlHaUpubVpEY0FMakZmRVFBQUFBJCQAAAAAAAAAAAEAAADzvSajSjdnaGgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChCV8AoQlfb; BDUSS_BFESS=ldxR1FyQ2FEaVZ5UWFjTDlRbThVZHJUQTY1S09PSU81SXlHaUpubVpEY0FMakZmRVFBQUFBJCQAAAAAAAAAAAEAAADzvSajSjdnaGgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChCV8AoQlfb; MCITY=-158%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=6; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; COOKIE_SESSION=204_0_5_9_4_6_0_0_5_4_0_0_533_0_0_0_1602246393_0_1602250500%7C9%2369429_193_1601361993%7C9; H_PS_PSSID=32757_32617_1428_7566_7544_31660_32723_32230_7517_32116_32718; H_PS_645EC=ab4cD3QpA7yZJBKDrrzZqesHzhDrwV%2BYww0WVHtmGJ3Adcj0qvjZIVV%2F9q4'
}

# Create the request object.
req = urllib.request.Request(url, headers=headers)

# Get the response object.
res = urllib.request.urlopen(req)

# Read the response.
html = res.read().decode('utf-8')

# Write it to a file.
with open('搜索3.html', 'w', encoding='utf-8') as f:
    f.write(html)
The second coding lesson of the crawler course — the third class overall.
Time flies; counting the opening ceremony, that is already four crawler classes.
This class mainly covered the urllib.parse module and its common methods, plus the GET and POST request styles.
We then wrote the same crawler three ways — as a plain script, with functions, and as a class. Finally it introduced the much easier-to-use requests module: response methods, setting a proxy with requests, and SSL. Crawler code is far longer than data-analysis code; one post probably won't hold all the notes.
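Proxies and SSL with requests were only talked about, not coded in class, so the following is just a sketch of my own under the assumption of a local HTTP proxy listening on 127.0.0.1:8888 (both the proxy address and the test URL are placeholders):

import requests

proxies = {
    "http": "http://127.0.0.1:8888",    # placeholder proxy address
    "https": "http://127.0.0.1:8888",
}
headers = {"User-Agent": "Mozilla/5.0"}

# proxies= routes the request through the proxy; verify=False skips SSL certificate
# verification (handy for sites with broken certificates, at the cost of security,
# and it prints an InsecureRequestWarning).
resp = requests.get("https://httpbin.org/ip", headers=headers,
                    proxies=proxies, verify=False, timeout=5)
print(resp.text)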
import urllib.request
import urllib.parse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

Overall structure — GET: the query parameters are visible in the URL. POST: the form data is passed as the data parameter of Request().

# With POST the query parameters / submitted data are hidden in the form and do not
# show up in the URL; they must be submitted as bytes, not str.
name = input("请输入贴吧的名字:")
begin = int(input("请输入起始页:"))
end = int(input("请输入结束页:"))
# Re-assign after encoding.
kw = {"kw": name}
kw = urllib.parse.urlencode(kw)
# Build the URL, send the request, get the response.
for i in range(begin, end + 1):
    pn = (i - 1) * 50
    # print(pn)  "https://tieba.baidu.com/f?kw=%???&pn=0"
    baseurl = "https://tieba.baidu.com/f?"
    url = baseurl + kw + "&pn=" + str(pn)
    # print(url)
    # Send the request.
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Write to file.
    filename = "第" + str(i) + "页.html"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
import urllib.request
import urllib.parse

def readpage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    return html

def writepage(filename, html):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

# Main function.

def main():
    name = input("请输入贴吧的名字:")
    begin = int(input("请输入起始页:"))
    end = int(input("请输入结束页:"))
    # Re-assign after encoding.
    kw = {"kw": name}
    kw = urllib.parse.urlencode(kw)

    for i in range(begin, end + 1):
        pn = (i - 1) * 50
        # print(pn)  "https://tieba.baidu.com/f?kw=%???&pn=0"
        baseurl = "https://tieba.baidu.com/f?"
        url = baseurl + kw + "&pn=" + str(pn)
        # Call the helper functions.
        html = readpage(url)
        filename = "第" + str(i) + "页.html"
        writepage(filename, html)

if __name__ == "__main__":
    main()
import urllib.request
import urllib.parse

class BaiduSpider():
    # Put the parts that are reused and never change into __init__.
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.baseurl = "https://tieba.baidu.com/f?"

    def readpage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html

    def writepage(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
            print("write successfully")

    def main(self):
        name = input("请输入贴吧的名字:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        # Re-assign after encoding.
        kw = {"kw": name}
        kw = urllib.parse.urlencode(kw)

        for i in range(begin, end + 1):
            pn = (i - 1) * 50
            # print(pn)  "https://tieba.baidu.com/f?kw=%???&pn=0"
            # baseurl = "https://tieba.baidu.com/f?"
            url = self.baseurl + kw + "&pn=" + str(pn)
            # Call the methods.
            html = self.readpage(url)
            filename = "第" + str(i) + "页.html"
            self.writepage(filename, html)

if __name__ == "__main__":
    # To call main() we have to instantiate the class.
    spider = BaiduSpider()
    spider.main()
import urllib.request
import urllib.parse
import json

key = input("请输入你要翻译的内容:")
# The data taken from the form.
data = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15880623642174',
    'sign': 'c6c2e897040e6cbde00cd04589e71d4e',
    'ts': '1588062364217',
    'bv': '42160534cfa82a6884077598362bbc9d',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION'
}

data = urllib.parse.urlencode(data)  # percent-encode the form data

data = bytes(data, "utf-8")  # force-convert to bytes

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

req = urllib.request.Request(url, data=data, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")
# print(html)

# Convert the JSON string into a Python dict.
r_dict = json.loads(html)
# print(type(r_dict), r_dict)
r = r_dict['translateResult'][0][0]['tgt']
print(r)

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
# Send the request.
response = requests.get('https://qq.yh31.com/zjbq/2920180.html', headers=headers)

# Print the response object.
# print(response)
# print(response.text)  # returns str data. text is really a method wrapped as a
# property (via a decorator), so it is used without (). Note the garbled characters
# here: when decoding, requests guesses an encoding from its detection library.

# print(response.content)  # returns the raw byte stream

# Fix the mojibake, way 1 — the most fundamental one: decode the bytes yourself.
# print(response.content.decode('utf-8'))

# Way 2: set the encoding before using .text.
response.encoding = 'utf-8'
print(response.text)

import requests
import json

key = input("请输入你要翻译的内容:")

data = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15880623642174',
    'sign': 'c6c2e897040e6cbde00cd04589e71d4e',
    'ts': '1588062364217',
    'bv': '42160534cfa82a6884077598362bbc9d',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION'
}

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

res = requests.post(url, data=data, headers=headers)
res.encoding = 'utf-8'
html = res.text
print(html)


For the fourth class I happened to be out of town and missed the live session completely; I only watched the recording today. It mainly covered cookies — identifying a user by storing information on the client side — and sessions, which keep a conversation alive. The 12306 ticket site was used as the example, which is very hard: you have to handle the links and the captcha images, and since I rarely need it I did not practise that source code.
After that came regular expressions, whose main use cases are crawling and form validation: the meaning of the match() parameters, and metacharacters.
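Since I did not keep the 12306 code, here is only a small sketch of my own showing the idea behind a session: a requests.Session() object carries cookies from one request to the next (httpbin.org is used here as a stand-in, not the site from class):

import requests

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

# The first request sets a cookie on the session ...
session.get("https://httpbin.org/cookies/set/token/abc123")
# ... and later requests through the same session send it back automatically.
resp = session.get("https://httpbin.org/cookies")
print(resp.json())   # {'cookies': {'token': 'abc123'}}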
import requests

url = "https://www.zhihu.com/hot"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': '_zap=2b2da192-4a19-494c-ad1f-5958d7e7c2ed; d_c0="AJCcbXXNNBGPTkkYDJqqrnc1aAb2_C-bCsw=|1588381296"; _ga=GA1.2.1921282150.1588381305; _xsrf=Bzdm8qxmJxdew35etcYJLyGZ4bnXKbTk; capsion_ticket="2|1:0|10:1602914462|14:capsion_ticket|44:YmQzZWY3OWUzMWZkNDZhMTljOGU1Mzg1NmMxMDc2ZjQ=|74f0c75f6fe930b0c1b846c6f10fe2e40e3cfdf6ea208e1bee57b37c2f20959b"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1601782524,1601782524,1601792284,1602914461; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1602914461; SESSIONID=PExwgztMg8NTHEIPaHNXFdNcmFUpNlmyRzAqHhR4Tuh; KLBRSID=53650870f91603bc3193342a80cf198c|1602914463|1602914461; JOID=VV4QBE7Zoo5GnXawfdlxkT-vC81uiOT0M_gNxzK8m8MQ1zH4LkQrBBmedLF9WUyPnHvzeZ8hPSWlMF-7pG5tJEY=; osd=Ul0dBE3eoYNGnnGzcNlyljyiC85pi-n0MP8OyjK_nMAd1zL_LUkrBx6debF-Xk-CnHj0epIhPiKmPV-4o21gJEU='
}

res = requests.get(url, headers=headers)

print(res.text)

import requests

def query():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Cookie': '_uab_collina=159411739915355684359315; JSESSIONID=2CA548ACBAE3C9D5A89DEC11B114092C; tk=ozQ_C6DmGnfgkG6vVrIbf14VPmvctqELdZ3A_Q36w1w0; _jc_save_wfdc_flag=dc; BIGipServerotn=1307574794.50210.0000; BIGipServerpool_passport=283378186.50215.0000; RAIL_EXPIRATION=1603212845478; RAIL_DEVICEID=pwI3nLwCVl1tmaCqFY91TAPCGiPP1DvQ4ZZuyh9EoopLY3yIsXiOGZZi-JsbiazprFnKYmYXbLq8fzIxltxY-G7qJC8xekxUVfTWMfwVOS2LYogIbdwsZhZnJVHBOO7_GqRaxm5Ht9PLTgXkMYZACGTyxitObYXu; route=c5c62a339e7744272a54643b3be5bf64; _jc_save_toDate=2020-10-17; _jc_save_toStation=%u8D35%u9633%2CGIW; _jc_save_fromDate=2020-10-30; uKey=2bdff5e6696c3bf98d33828e2ccd10506890ad368d3f4da37deba18de81c3717; current_captcha_type=Z; _jc_save_fromStation=%u5408%u80A5%2CHFH'
    }

    response = requests.get("https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2020-10-30&leftTicketDTO.from_station=HFH&leftTicketDTO.to_station=GIW&purpose_codes=ADULT", headers=headers)

    print(response.content.decode("utf-8"))

query()
import re

# match(pattern, string, flags=0): the first argument is the regex pattern; on success
# it returns a match object, on failure it returns None. The second argument, string,
# is the text to match against. The third, flags=0, controls how the pattern is
# matched, e.g. case-insensitive or multi-line matching.
pattern = 'python'
s = 'python and java'

result = re.match(pattern, s)

if result:
    # print(result)
    print(result.group())
    print(result.start())
    print(result.end())
    print(result.span())
else:
    print("no data was found")
# "." matches any single character except a newline (\n).
re.match(r'a.c', 'avc')
re.match(r'a.c', 'avc').group()
re.match(r'a.c', 'a你c').group()
# re.match(r'a.c', 'a你好c').group()
re.match(r'a|c', 'a').group()
re.match(r'a|c', 'c').group()
re.match(r'a|c', 'ac').group()
# re.match(r'a|b', 'cba').group()  # fails, because match() only looks at the start
re.search(r'a|b', 'cba').group()

# [] matches a single character from the set
re.match(r'[abc]', 'b').group()
# re.match(r'[abc]2', 'a').group()
re.match(r'速度与激情[12345678]', '速度与激情12').group()

# [^...] negates the character set
re.match(r'速度与激情[^12345678]', '速度与激情0').group()
re.match(r'速度与激情[^12345678]', '速度与激情01').group()

# [] can also take a character range, e.g. [1-3] or [a-b]

re.match(r'速度与激情[1-8]', '速度与激情7').group()
re.match(r'速度与激情[a-z]', '速度与激情w').group()
# re.match(r'速度与激情[a-z]', '速度与激情W').group()

# \ escapes the character that follows it
re.match(r'速度.与激情[a-z]', '速度.与激情w').group()
re.match(r'速度\.与激情[a-z]', '速度.与激情w').group()  # note the backslash: the "." is now literal

18 Oct 2020 — it's the weekend, so I finally have time to catch up on Friday's class.
Second regex class. It was all small, fairly tedious knowledge points, taught with Jupyter plus PyCharm: regular expressions as logical filters over strings (used for crawling and form validation), ordinary characters, metacharacters, the predefined character classes \w, \d and \s, repetition, non-greedy matching, the common re module functions — match, search, compile, findall, split, sub — grouping, and some special-case patterns.

# \d matches any single digit 0-9
re.match(r'123', '123').group()
re.match(r'[1]23', '123').group()
re.match(r'[123]', '123').group()
re.match(r'\d', '123').group()

# \w matches any single character in 0-9, A-Z, a-z, _ or a Chinese character
re.match(r'\w', 'a123').group()
re.match(r'\w', '0123').group()
re.match(r'\w', '_0123').group()
re.match(r'\w', '你0123').group()

# \s matches any single whitespace character: space, tab, form feed, etc.
re.match(r'\s', ' ').group()
re.match(r'\s', '\t').group()
re.match(r'速度与激情\d', '速度与激情7').group()
re.match(r'速度与激情\w', '速度与激情7').group()
re.match(r'速度与激情\w', '速度与激情a').group()
re.match(r'速度与激情\s', '速度与激情 1').group()
# \D is the complement of \d: any single non-digit character
re.match(r'速度与激情\D', '速度与激情啊').group()
# \W is the complement of \w
re.match(r'速度与激情\W', '速度与激情$').group()

# Repetition: {n} means repeat exactly n times

re.match(r'\d{3}', '999').group()
re.match(r'\d{11}', '18519018835').group()

# {m,n} means repeat at least m and at most n times

re.match(r'\d{3,4}-\d{7,8}', '0123-1234567').group()
re.match(r'\d{3,4}-\d{7,8}', '012-1234567').group()
re.match(r'\d{3,4}-\d{7,8}', '012-12345678').group()
# {m,} means repeat at least m times
re.match(r'\d{3,}-\d{7,8}', '01234-12345678').group()

# ? means the preceding character appears 0 or 1 times

re.match(r'w[a-z]', 'wedco').group()
re.match(r'w[a-z]?', 'wedco').group()
re.match(r'w[a-z]?', 'w').group()

# + means the preceding character appears 1 or more times (at least once)

re.match(r'w[a-z]+', 'wedco').group()
re.match(r'w[a-z]+', 'we').group()
# re.match(r'w[a-z]+', 'w').group()

# * means the preceding expression appears 0 or more times

html_content = '''fasgsgggga
fasfgsdgs
fjtgitykiyuuyl
gdhdhjht
gfsdgsdg
'''
re.match(r'.', html_content).group()
re.match(r'.*', html_content, re.S).group()   # re.S lets "." also match newlines

# Greedy matching

import re
# (the HTML tags of the original example were stripped when these notes were posted;
# a pair of <div> blocks is assumed here)
s = r'<div>abc</div><div>bcd</div>'

# Goal: match the <div> block.
# ptn = r'<div>.*</div>'    # greedy: runs all the way to the last </div>
ptn = r'<div>.*?</div>'     # non-greedy: stops at the first </div>
r = re.match(ptn, s)
print(r.group())

import re
# Match data: the first argument is the regex pattern, the second is the data (a list).
def fn(ptn, lst):
    for x in lst:
        result = re.match(ptn, x)
        if result:
            print(x, "it matches", "the results are:", result.group())
        else:
            print(x, "match fails")

lst = ["abc1", "ab", "aba", "abbcd", "other"]

# Match any single character except a newline

ptn = 'ab.'

lst = ["man", "mbn", "mdn", "mon", "nba"]

# Match one character from the [] set

ptn = "m[abcd]n"

# lst = ["py2", "py3", "other", "pyxxx", "nba"]

# Match \d

# ptn = "py\d"

# Match \D

# ptn = "py\D"

# lst = ["hello world", "hellodajia", "hello,world", "pyxxx", "nba"]
# Match \s (whitespace)
# ptn = 'hello\sworld'

# Match \S (non-whitespace)
# ptn = 'hello\Sworld'

# Match \w
# lst = ["1-age", "a-age", "#-age-", "_-age", "美-age"]

# ptn = '\w-age'
# \W matches a non-word character
# ptn = '\W-age'

# * : the preceding character appears 0 or more times

lst = ["hello", "abc", "xxx", "h", "美-age"]

ptn = 'h[a-z]*'

# + : the preceding character appears 1 or more times

lst = ["hello", "abc", "xxx", "h", "美-age"]

ptn = 'h[a-z]+'

# {m}: the preceding character appears exactly m times

lst = ["hello", "python", "$%&@", "123456", "美12-age", "美12_age"]

ptn = "[\w]{6}"

# {m,n}: the preceding character appears m to n times

lst = ["abcd", "python", "$%&@", "_xxx123", "12345678"]

ptn = "[\w]{3,7}"

# {m,}

lst = ["ab", "python", "+$%&@", "_xxx123", "123456789999"]

# ptn = "[\w]{3,}"

ptn = "\w{3,}"  # works without the square brackets too

lst = ["[email protected]", "[email protected]", "[email protected]", "_xxx123", "12345678"]
# ptn = "[\w][email protected]"
# ptn = "\[email protected]"   # works without the square brackets too
# ptn = "\[email protected]$"  # the string must end right after qq.com
ptn = "\[email protected]?"

fn(ptn, lst)

import re
text = "apple price is $99,orange price is $88"

# Goal: find $99 and $88. ".+" matches any run of characters; "\$" is a literal dollar sign.
# result = re.search(r'.+\$\d+.+\$\d+', text)   # match everything in one go
result = re.search(r'.+(\$\d+).+(\$\d+)', text)
# print(result.group())

print(result.group(1))  # the content of the first group

print(result.group(2))  # the content of the second group

print(result.group(0))  # 0 is the default, i.e. the whole match

print(result.groups())  # all the groups

pat = re.compile(r'abc')
pat.match('abc123')
pat.match('abc123').group()

pat = re.compile(r'abc')
pat.match('ABC123')
# pat.match('ABC123').group()  # no match

pat = re.compile(r'abc', re.I)   # re.I: ignore case
pat.match('ABC123').group()      # now it matches
# search looks anywhere in the text
re.search(r'abc', '123abcfmaingjshkaiuhabc900').group()
# findall returns a list of all matches; the result has no group attribute
re.findall(r'abc', '123abcfmaingjshkaiuhabc900')
re.findall(r'Abc', '123abcfmaingjshkaiuhabc900')  # returns an empty list
s = "8+7*5+6/3"
re.findall(r'\d{1,}', s)
re.findall(r'\d+', s)
re.split(r'[+*/]', s)
re.split(r'[+*/]', s, maxsplit=2)
s = 'i am jerry i am very handsome! i like you'
r = re.sub(r'i', 'I', s)  # replace lowercase i with uppercase I
r

The October 20 class had relatively little code: regex practice plus a first look at XPath/XML.

To crawl images from Baidu Images, first change "index" to "flip" in the URL so that the results become paginated.

import requests
import re

url = "http://pic.feizl.com/upload/allimg/170614/1QR95224-5.jpg"

with open("1.jpg", 'wb') as f:
    r = requests.get(url)
    f.write(r.content)

Steps: 1. get the target URL; 2. get the page source; 3. extract every image URL; 4. save the images.

url = "https://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1603249247805_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1603249247806%5E00_2543X1297&sid=&word=%E7%BE%8E%E5%A5%B3"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': 'BAIDUID=BD7E1E18524FFC27F134FC0750F2A3B8:FG=1; BIDUPSID=BD7E1E18524FFC27F134FC0750F2A3B8; PSTM=1588334179; BDUSS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDUSS_BFESS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=3; H_PS_PSSID=32814_1443_32872_32705_32230_7517_32116_22159; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=2k2la4al8h21alavai1fov8db0l; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=www.baidu.com; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm'
}

r = requests.get(url, headers=headers)
ret = r.text
# print(ret)
result = re.findall('"objURL":"(.*?)"', ret)
# print(result)

for i in result:
    # use the last 10 characters of the URL as the image name
    name = i[-10:]
    # clean the name: some contain "/" and cannot be written into the img folder
    name = re.sub("/", "", name)
    print(name)
    # make sure the file name has an image extension
    end = re.search(r'(.jpg|.png|.jpeg|.gif)$', name)  # the r prefix goes before the string, outside the ()
    if end == None:
        name = name + '.jpg'

    with open('img/' + name, 'wb') as f:  # save into the img folder under the current directory
        # handle network problems with an exception
        try:
            r = requests.get(i)
        except Exception as e:
            print(e)
        # r = requests.get(i)
        f.write(r.content)

In the example XML, the document itself is the document node, the element holding "J K. Rowling" is an element node, and lang="eng" is an attribute node.
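The class explained these node types with the classic bookstore XML; the snippet below is my own reconstruction of that example (not the actual course file) so the three node types have something concrete to point at:

from lxml import etree

# Reconstructed bookstore example: <title> and <author> are element nodes,
# lang="eng" is an attribute node, "J K. Rowling" is a text node.
xml = """
<bookstore>
  <book>
    <title lang="eng">Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </book>
</bookstore>
"""
root = etree.fromstring(xml)
print(root.xpath("//title/@lang"))     # attribute node -> ['eng']
print(root.xpath("//author/text()"))   # text of an element node -> ['J K. Rowling']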

A handy node-selection tool: the Chrome extension XPath Helper.

# XPath (XML Path Language) navigates a document through elements and attributes.
# A predicate, written inside [], finds a specific node or a node containing a specific
# value — see the small predicate example after the li/a demo below.

import lxml

print("hello")

from lxml import etree

# (the HTML tags were stripped when these notes were published; this is the standard
# lxml tutorial snippet the "first item ... fifth item" lines came from)
wb_data = """
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
"""

# turn wb_data into an Element object
html_element = etree.HTML(wb_data)

# print(html_element)

result = etree.tostring(html_element)

r = result.decode()

# print(result)  # prints rather messily

print(type(r), r)

# get the href of the <a> under each <li>
links = html_element.xpath("//li/a/@href")
print(links)
# get the text inside each <a>
result = html_element.xpath('//li/a/text()')
print(result)
# combine the scraped data into dicts, pairing each href with its title

for link in links:
    d = {}
    d['href'] = link
    # get the index of this link
    # print(links.index(link))
    d['title'] = result[links.index(link)]
    print(d)
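The predicate syntax mentioned earlier never actually shows up in this demo, so here is a tiny self-contained example of my own (the HTML snippet is made up):

from lxml import etree

demo = etree.HTML('<ul><li class="item-0"><a href="link1.html">first</a></li>'
                  '<li class="item-1"><a href="link2.html">second</a></li></ul>')
print(demo.xpath('//li[1]/a/text()'))                # first <li>        -> ['first']
print(demo.xpath('//li[last()]/a/text()'))           # last <li>         -> ['second']
print(demo.xpath('//li[@class="item-1"]/a/text()'))  # <li> by attribute -> ['second']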

The October 22 class covered reading and writing CSV (and the blank-line/newline issue), then introduced bs4: a look at its source, basic usage, Tag, NavigableString, Comment, and traversing the document tree and child nodes.

# Reading and writing CSV
import csv

titles = ('name', 'age', 'height')
persons = [('张三', 20, 175), ('李四', 22, 178), ('王五', 30, 180)]

# Write row by row

with open("person.csv", "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(titles)
    for data in persons:
        writer.writerow(data)

# Write several rows at once

with open("person.csv", "w", encoding="utf-8", newline="") as f:
    # with newline="" there are no blank lines between rows
    writer = csv.writer(f)
    writer.writerow(titles)
    writer.writerows(persons)

# Second way: DictWriter

persons = [
    {'name': '张三', 'age': '20', 'height': '175'},
    {'name': '李四', 'age': '20', 'height': '175'},
    {'name': '王五', 'age': '20', 'height': '175'},
]

with open("person.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, titles)
    writer.writeheader()
    writer.writerows(persons)

with open("person.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    for i in reader:
        print(i)

# Second way to read: DictReader

with open("person.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i in reader:
        print(i["name"])

Task: scrape each movie's title, rating, quote and detail-page URL from the Douban Top 250 (10 pages), then save the data to a CSV file.

XPath: //div[@class="info"]

//div[@class="info"]/div[@class='hd']/a/span[@class='title']

# //div[@class="info"]/div[@class='hd']/a/span[@class='title']/text()

# page 2: https://movie.douban.com/top250?start=25&filter=
# page 3: https://movie.douban.com/top250?start=50&filter=
# pattern: start = (page - 1) * 25
import requests
from lxml import etree
import csv

# Test the connection

url = "https://movie.douban.com/top250?start=0&filter="

headers = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',

    'Cookie': 'DSID=AAO-7r5nXMym6lqYC0CQuKHfBMYUuFyKgO5zR8YXBDICiw_TMfaKmRajdoYbdQ2z15T2UoQ3R4fu_NLNptnImDvDKE5Gg3rxw1y7Kcp3XmRIriGCQdbMLjU; id=2289b75107c40002||t=1602377859|et=730|cs=002213fd489ca8a8699c62b057'

}

response = requests.get(url, headers=headers)

print(response.text)

doubanurl = "https://movie.douban.com/top250?start={}&filter="
# Get the page source

def getSource(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Cookie': 'DSID=AAO-7r5nXMym6lqYC0CQuKHfBMYUuFyKgO5zR8YXBDICiw_TMfaKmRajdoYbdQ2z15T2UoQ3R4fu_NLNptnImDvDKE5Gg3rxw1y7Kcp3XmRIriGCQdbMLjU; id=2289b75107c40002||t=1602377859|et=730|cs=002213fd489ca8a8699c62b057'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

# Parse the data: title, rating, quote, detail-page URL
def getEveryItem(source):

    html_element = etree.HTML(source)
    movieItemList = html_element.xpath('//div[@class="info"]')
    # list that collects one dict per movie
    movieList = []
    # gather the details of each movie
    for eachMovie in movieItemList:
        movieDict = {}

        title = eachMovie.xpath("div[@class='hd']/a/span[@class='title']/text()")
        otherTitle = eachMovie.xpath("div[@class='hd']/a/span[@class='other']/text()")
        link = eachMovie.xpath('div[@class="hd"]/a/@href')[0]
        star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]  # indexing with [0] unwraps the list
        quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')

        # some movies have a quote, some do not
        if quote:
            quote = quote[0]
        else:
            quote = ''
        # assemble the record
        movieDict['title'] = "".join(title + otherTitle)
        movieDict['url'] = link
        movieDict['star'] = star
        movieDict['quote'] = quote
        movieList.append(movieDict)
        print(movieList)
    return movieList

# Save the data

def writeData(movieList):
    with open("douban.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()
        for each in movieList:
            writer.writerow(each)
# Run it
if __name__ == '__main__':
    movieList = []
    for i in range(10):
        # URL of each page
        pageLink = doubanurl.format(i * 25)
        # source of each page
        source = getSource(pageLink)
        # parse: movielist = movielist + geteveryitem
        movieList += getEveryItem(source)

    # write the data
    writeData(movieList)

# import bs4
# Tag
# Comment
# NavigableString

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, features="lxml")
# print(soup)
# print(soup.prettify())  # pretty-printed output

print(soup.title)

print(soup.title.name)

print(soup.title.string)

print(soup.p)

r = soup.find_all("p")

print(len(r))

links = soup.find_all('a')

for link in links:
    # print(link)
    print(link.get('href'))

soup = BeautifulSoup(html_doc, 'lxml')

print(type(soup.title))

print(type(soup.p))

print(type(soup.a))

print(soup.title.name)

print(soup.p.name)

print(soup.p.attrs)

print(soup.p.string)  # get the text

print(type(soup.p.string))  # NavigableString

# Traversing the document tree: contents returns a list of all child nodes;
# children returns an iterator over the child nodes;
# descendants returns a generator over all descendants.
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

head_tag = soup.head

print(head_tag.contents)

print(head_tag.children)

for i in head_tag.children:
    print(i)

for x in soup.descendants:
    print('----------')
    print(x)

Saturday's class: the code was all fairly simple, mainly practice with find, find_all and select. BeautifulSoup really is convenient.
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "lxml")

# string gets the content of a single tag; strings returns a generator that yields
# the contents of several tags.

# stripped_strings is basically the same as strings, but it strips the extra whitespace.

title_tag = soup.title

print(title_tag)

print(title_tag.string)

head_tag = soup.head

print(head_tag.string)

html_tag = soup.html

print(html_tag.string)

s = soup.strings

for i in s:
    print(i)

s = soup.stripped_strings

# print(s)

for i in s:
    print(i)

# parent gets the direct parent node
# parents gets all the ancestors

title_tag = soup.title

print(title_tag.parent)

# print(soup.html.parent)

a_tag = soup.a

# print(a_tag.parents)

for p in a_tag.parents:
    print(p)
    print('-' * 50)

# next_sibling: the next sibling node; previous_sibling: the previous sibling node.

# next_siblings: all the following siblings; previous_siblings: all the preceding siblings.
html = '<a><b>bbb</b><c>ccc</c></a>'   # (the tags are an assumption: the markup was stripped from the original notes)
soup2 = BeautifulSoup(html, 'lxml')

print(soup2)

print(soup2.prettify())

b_tag = soup2.b

print(b_tag)

print(b_tag.next_sibling)

print(b_tag.previous_sibling)

a_tag = soup.find(id='link3')
# print(a_tag)
for x in a_tag.previous_siblings:
    print(x)
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "lxml")
# Here the 'a' argument acts as a string filter.
# a_tag = soup.find('a')   # finds the first matching <a>
# print(a_tag)

a_tags = soup.find_all('a')  # finds all the <a> tags

print(a_tags)

# A list filter: find both <p> and <a> tags

print(soup.find_all(['p', 'a']))  # returns all the matching tags as a list

print(soup.find_all(['title', 'b']))

'''
find_all() returns all the matching tags as a list.
find() returns only the first match.
'''
from bs4 import BeautifulSoup

# (the <table> markup was stripped when these notes were posted; the structure below is
# a reconstruction, and the href/id/class attribute values are placeholders of mine)
html = """
<table class="tablelist">
<tr class="h"><td>职位名称</td><td>职位类别</td><td>人数</td><td>地点</td><td>发布时间</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=1">22989-金融云区块链高级研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=2">22989-金融云高级后台开发</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=3">SNG16-腾讯音乐运营开发工程师(深圳)</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=4">SNG16-腾讯音乐业务运维工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=5">TEG03-高级研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=6">TEG03-高级图像算法研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=7">TEG11-高级AI开发工程师(深圳)</a></td><td>技术类</td><td>4</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td class="l square"><a id="test" class="test" href="position_detail.php?id=8">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="even"><td class="l square"><a id="test" class="test" href="position_detail.php?id=9">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=10">SNG11-高级业务运维工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
</table>
"""

soup = BeautifulSoup(html, 'lxml')
# 1. Get the <tr> tags

print(soup.tr)

print(soup.find('tr'))

trs = soup.find_all('tr')

for tr in trs:
    print(tr)
    print('-' * 50)

# 2. Get the second <tr> tag
# tr = soup.find_all('tr', limit=2)
# tr = soup.find_all('tr', limit=2)[1]

tr = soup.findAll('tr')  # findAll is the same as find_all

tr = soup.findAll('tr', limit=2)

print(tr)

# 3. Get all <tr> tags whose class is "even"

trs = soup.findAll('tr', class_='even')  # class is a keyword, so class_ is used instead

for tr in trs:
    print(tr)
    print('-' * 50)

trs = soup.find_all('tr', attrs={'class': 'even'})  # attrs takes a dict, so class needs no special handling

for tr in trs:
    print(tr)
    print('-' * 50)

# 4. Extract all <a> tags whose id is "test" and whose class is "test"

r = soup.find_all('a', id='test', class_='test')

for a in r:
    print(a)

# 5. Get the href attribute of every <a> tag
a = soup.find_all('a')
for i in a:
    href = i['href']
    # href = i.attrs['href']  # second way to write it
    print(href)

# 6. Get all the job information (the text data)
trs = soup.find_all('tr')[1:]  # start from the second <tr>
for tr in trs:
    tds = tr.find_all('td')
    jobname = tds[0].string  # take the value via .string
    print(jobname)
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "lxml")

# 1. Search by tag name
# print(soup.select('a'))
# 2. Search by class name: '.sister' means class="sister"
# print(soup.select('.sister'))
# 3. Search by id
# print(soup.select('#link1'))

# 4. Combined selectors

# print(soup.select('p #link1'))
# print(soup.select('head>title'))

# 5. Search by attribute

# print(soup.select('a[href="http://example.com/elsie"]'))  # single quotes outside, double quotes inside
# 6. Get the content
print(soup.select('title')[0].get_text())  # select returns a list, so index into it first, then call the method
print(soup.select('title')[0].string)      # second way

The programs here are mostly for review.

from bs4 import BeautifulSoup

# (same reconstructed jobs table as above; the href/id/class attribute values are placeholders)
html = """
<table class="tablelist">
<tr class="h"><td>职位名称</td><td>职位类别</td><td>人数</td><td>地点</td><td>发布时间</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=1">22989-金融云区块链高级研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=2">22989-金融云高级后台开发</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=3">SNG16-腾讯音乐运营开发工程师(深圳)</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=4">SNG16-腾讯音乐业务运维工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=5">TEG03-高级研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=6">TEG03-高级图像算法研发工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="even"><td class="l square"><a href="position_detail.php?id=7">TEG11-高级AI开发工程师(深圳)</a></td><td>技术类</td><td>4</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td class="l square"><a id="test" class="test" href="position_detail.php?id=8">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="even"><td class="l square"><a id="test" class="test" href="position_detail.php?id=9">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
<tr class="odd"><td class="l square"><a href="position_detail.php?id=10">SNG11-高级业务运维工程师(深圳)</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
</table>
"""

soup = BeautifulSoup(html, 'lxml')

# 1. Get all the <tr> tags

trs = soup.select('tr')

print(trs)

# 2. Get the second <tr> tag

tr = soup.select('tr')[1]

print(tr)

# 3. Get all <tr> tags whose class is "even"

trs = soup.select('.even')

print(trs)

# 4. Get the href attribute of all <a> tags

alst = soup.select('a')

for a in alst:
    href = a['href']
    print(href)

# 5. Get all the job information (the text data)
trs = soup.select('tr')
for tr in trs:
    info = list(tr.stripped_strings)  # .string alone breaks on nested tags/newlines;
    # both tr.stripped_strings and tr.strings are generators
    print(info)

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "lxml")

# 1. First find the <p> tag

tag_p = soup.p

print(tag_p)

tag_p.name = 'w'  # change the tag name

tag_p['class'] = 'content'  # change the class attribute

print(tag_p)

# ------------

tag_p = soup.p

print(tag_p)

tag_p.string = '亲爱的同学们'  # replace the text

print(tag_p.string)

tag_p = soup.p

tag_p.append('you need python')  # append a string

print(tag_p)

r = soup.find(class_='title')
# print(r)
r.decompose()  # remove the tag from the tree
print(soup)

The crawler code really is getting too long, so this is the last update to this post; the rest will go into "Python Web Scraping Course Notes, Part 2". This class walked through one case study and then summarised the crawling workflow: prepare the URLs (whether the pages are known up front or the page count is not); send requests to those URLs with a random User-Agent, cookies and proxy IPs; extract the data — decide where the data lives (is it even in the current URL?) and how to extract it (XPath, bs4, regular expressions); and finally save the data, e.g. to a database.
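Neither the random User-Agent nor the database step appears in the code I kept, so the sketch below is only my own guess at what was meant, with made-up UA strings and a local SQLite file:

import random
import sqlite3
import requests

# A small pool of (made-up) browser User-Agent strings to rotate through.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

def fetch(url):
    headers = {"User-Agent": random.choice(USER_AGENTS)}  # random UA per request
    return requests.get(url, headers=headers, timeout=10).text

# Save scraped rows into SQLite instead of CSV.
conn = sqlite3.connect("spider.db")
conn.execute("CREATE TABLE IF NOT EXISTS page (url TEXT, html TEXT)")
html = fetch("https://httpbin.org/html")
conn.execute("INSERT INTO page VALUES (?, ?)", ("https://httpbin.org/html", html))
conn.commit()
conn.close()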

a = ['1', '2', '3']

b = '-'.join(a)

print(b)

c = 'hello'

d = '*'.join(c)

print(d)

Scraping weather data. Task: get every city in the country (provincial capitals and municipalities) together with its temperature, and save the result to CSV.

# http://www.weather.com.cn/textFC/hb.shtml  North China
# http://www.weather.com.cn/textFC/db.shtml  Northeast China
import requests
import csv
from bs4 import BeautifulSoup

# The CSV header.
titles = ('city', 'temp')
# A function that parses one regional page.
def pares_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # print(response.text)
    text = response.content.decode('utf-8')  # fixes the mojibake
    # soup = BeautifulSoup(text, 'lxml')
    # pip install html5lib
    soup = BeautifulSoup(text, 'html5lib')
    # Parse the page:
    # find the conMidtab div
    conMidtab = soup.find('div', class_='conMidtab')
    # find the table tags
    tables = conMidtab.find_all('table')

    # Collected data.
    lst = []

    for table in tables:
        # print(table)
        # print('-'*50)
        # find all the tr tags and drop the first two (header rows)
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            # print(tr)
            tds = tr.find_all('td')
            city_td = tds[0]
            if index == 0:
                # in the first row the province occupies tds[0], so the city is tds[1]
                city_td = tds[1]
            info = {}
            city = list(city_td.stripped_strings)[0]  # the city
            tds = tr.find_all('td')
            temp_td = tds[-2]
            temp = list(temp_td.stripped_strings)[0]  # the temperature

            info['city'] = city
            info['temp'] = temp
            lst.append(info)
            print('city:', city, 'temp:', temp)
    return lst
    # break  # print only Beijing's data first

def writeData(lst):
    with open('citytemp.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, titles)
        writer.writeheader()
        writer.writerows(lst)

def main():
    lst = []
    # url = 'http://www.weather.com.cn/textFC/hb.shtml'
    # url = 'http://www.weather.com.cn/textFC/db.shtml'
    # url = 'http://www.weather.com.cn/textFC/gat.shtml'
    urls = ['http://www.weather.com.cn/textFC/hb.shtml', 'http://www.weather.com.cn/textFC/db.shtml', 'http://www.weather.com.cn/textFC/gat.shtml']
    for url in urls:
        lst += pares_page(url)
        # pares_page(url)
    writeData(lst)

if __name__ == '__main__':
    main()
# Download links for the driver tools: PhantomJS
# Mirror index of http://chromedriver.storage.googleapis.com/86.0.4240.22/
# http://npm.taobao.org/mirrors/chromedriver
from selenium import webdriver
# Load the driver. PhantomJS has been deprecated by now, but many old projects still use it.
driver = webdriver.PhantomJS(executable_path=r'D:\Program Files\PyCharm Community Edition 2020.1.2\phantomjs.exe')

# Open Baidu.
driver.get('https://www.baidu.com/')
# Locate elements and type into them with send_keys.
driver.find_element_by_id('kw').send_keys('python')  # the search box
driver.find_element_by_id('su').click()              # the search button
# print(driver.page_source)  # page source
print(driver.current_url)    # current URL
# Take a screenshot.
driver.save_screenshot('baidu.png')
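Since PhantomJS is deprecated, here is the same search as a sketch with headless Chrome instead — my own variant, not course code, and it assumes a chromedriver matching your Chrome version is on the PATH:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")        # run without opening a browser window
driver = webdriver.Chrome(options=options)

driver.get("https://www.baidu.com/")
driver.find_element_by_id("kw").send_keys("python")
driver.find_element_by_id("su").click()
print(driver.current_url)
driver.save_screenshot("baidu_headless.png")
driver.quit()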
