A CSV file ("comma-separated values" file) stores data in rows and columns, much like an Excel file; within a row, the values of the individual columns are separated by commas.
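For reference, the raw text of such a file looks like this; these are the first two lines of files/电影.csv as read in the examples below:
电影名称,评分,评论数,简介
肖申克的救赎,9.7,2675306人评价,希望让人自由。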
1) Reading CSV files
from csv import reader, DictReader
reader
Creates a reader from a file object; iterating over it yields the file's contents row by row, each row as a list. Note that CSV files should be opened with newline='' so the csv module can handle line endings itself.
with open('files/电影.csv', encoding='utf-8', newline='') as f:
    r1 = reader(f)
    print(r1)        # <_csv.reader object at 0x0000026A83FDE140>
    print(next(r1))  # ['电影名称', '评分', '评论数', '简介']
    print(list(r1))
DictReader
Uses the first row as the keys; that header row is consumed and is not returned as data.
with open('files/电影.csv', encoding='utf-8', newline='') as f:
    r2 = DictReader(f)
    print(r2)        # <csv.DictReader object at 0x...>
    print(next(r2))  # {'电影名称': '肖申克的救赎', '评分': '9.7', '评论数': '2675306人评价', '简介': '希望让人自由。'}
    for x in r2:
        print(x)
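Since every row produced by DictReader behaves like a dict, individual columns can be pulled out by header name. A minimal sketch reusing the same file:
from csv import DictReader

with open('files/电影.csv', encoding='utf-8', newline='') as f:
    # Collect just the score column, keyed by movie name
    scores = {row['电影名称']: row['评分'] for row in DictReader(f)}
print(scores)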
2) Writing CSV files
from csv import writer, DictWriter
with open('files/student1.csv', 'w', encoding='utf-8', newline='') as f:
    # 1. Create a writer object from the file object
    w1 = writer(f)
    # 2. Write data
    # 1) One row at a time
    w1.writerow(['姓名', '性别', '年龄'])
    w1.writerow(['小明', '男', '18'])
    # 2) Several rows at once
    w1.writerows([
        ['小花', '女', 18],
        ['小蓝', '男', '19']
    ])
with open('files/student2.csv', 'w', encoding='utf-8', newline='') as f:
    # 1. Create a DictWriter object from the file object, passing the field names
    w2 = DictWriter(f, ['姓名', '性别', '年龄'])
    # 2. Write data
    # 1) Write the keys as the first row; without this call the CSV has no header
    w2.writeheader()
    # 2) One row at a time
    w2.writerow({'姓名': '小明', '性别': '男', '年龄': 22})
    # 3) Several rows at once
    w2.writerows([
        {'姓名': '小红', '性别': '女', '年龄': 20},
        {'姓名': '小黄', '性别': '男', '年龄': 21},
        {'姓名': '小白', '性别': '女', '年龄': 21}
    ])
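When adding rows to a file that already has a header, open it in append mode and skip writeheader(), otherwise a second header row lands in the middle of the data. A minimal sketch, assuming files/student2.csv was created as above (the appended row is made up):
from csv import DictWriter

with open('files/student2.csv', 'a', encoding='utf-8', newline='') as f:
    w = DictWriter(f, ['姓名', '性别', '年龄'])
    # No writeheader() here: the file already has its header row
    w.writerow({'姓名': '小黑', '性别': '男', '年龄': 23})  # hypothetical sample row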
Exercise: scrape all of the Douban Top 250 data and save it into a CSV file.
import requests
from re import findall


def get_one_page(start=0):
    # 1. Fetch the page
    url = f'https://movie.douban.com/top250?start={start}&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = response.text
    # print(html)
    # 2. Parse the data
    # Note: the original regular expressions were lost from these notes; the
    # patterns below are reconstructed from the Douban page structure and may
    # need adjusting if the page markup changes.
    # 1) All movie names
    names = findall(r'<img width="100" alt="(.+?)"', html)
    # 2) Release year, country and genre of every movie
    info = findall(r'(?s)<p class="">(.+?)</p>', html)
    info = [x.strip().split('\n')[-1].strip() for x in info]
    times = []
    countries = []
    types = []
    for x in info:
        result = x.split(' / ')
        times.append(result[0])
        countries.append(result[1])
        types.append(result[2])
    # 3) Scores
    score = findall(r'<span class="rating_num" property="v:average">(.+?)</span>', html)
    # 4) Number of ratings
    comment = findall(r'(\d+)人评价', html)
    data = map(lambda i1, i2, i3, i4, i5, i6: (i1, i2, i3, i4, i5, i6),
               names, score, comment, times, countries, types)
    # print(list(data))
    return list(data)
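The map with a pass-through lambda just pairs the six lists up element by element; zip does the same thing more directly, so the map line could equivalently be written as:
data = list(zip(names, score, comment, times, countries, types))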
from csv import writer

if __name__ == '__main__':
    # Write the header first, then append one page of rows at a time
    with open('files/TOP250.csv', 'w', encoding='utf-8', newline='') as f:
        w1 = writer(f)
        w1.writerow(['电影名称', '评分', '评价人数', '上映年份', '国家', '类型'])
    for x in range(0, 226, 25):
        data = get_one_page(x)
        with open('files/TOP250.csv', 'a', encoding='utf-8', newline='') as f:
            w1 = writer(f)
            w1.writerows(data)
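Reopening the file in append mode on every iteration works, but the file only needs to be opened once; an equivalent sketch:
from csv import writer

if __name__ == '__main__':
    with open('files/TOP250.csv', 'w', encoding='utf-8', newline='') as f:
        w1 = writer(f)
        w1.writerow(['电影名称', '评分', '评价人数', '上映年份', '国家', '类型'])
        # Top 250 = 10 pages of 25 movies; start takes 0, 25, ..., 225
        for x in range(0, 226, 25):
            w1.writerows(get_one_page(x))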
Syntax:
selector { property1: value1; property2: value2; … }
Common properties: color (text color), background-color (background color), font-size (font size), width (width).
Selectors:
a{} - selects all a tags
p{} - selects all p tags
span{} - selects all span tags
Prefixing an id value with **#** forms an id selector, which selects the tag whose id attribute equals that value.
Note: within a single page, an id value must be unique.
#a{} - selects the tag whose id is a
#b1{} - selects the tag whose id is b1
Joining two selectors with **>** forms a child selector (the two parts must be in a direct parent-child relationship):
div>a{} - selects every a tag that is a direct child of a div tag
Example:
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title></title>
</head>
<body>
    <h1>标题1</h1>
    <a href="">我是超链接1</a>
    <p>我是段落1</p>
    <a href="">我是超链接2</a>
    <p>我是段落2</p>
    <span>我是span1</span>
    <a href="">我是超链接3</a>
    <style>
        /* tag selectors */
        a{color: red;}
        p{color: aquamarine;}
        span{color: bisque;}
    </style>

    <h1 id="i1">标题1</h1>
    <a href="">我是超链接1</a>
    <p>我是段落1</p>
    <a href="">我是超链接2</a>
    <p id="p1">我是段落2</p>
    <span>我是span1</span>
    <a href="">我是超链接3</a>
    <style>
        /* id selector */
        #p1{
            background-color: aqua;
        }
    </style>

    <h1 class="c1">标题1</h1>
    <a href="">我是超链接1</a>
    <p class="c1 c2 c3">我是段落1</p>
    <a href="">我是超链接2</a>
    <p>我是段落2</p>
    <span class="c1">我是span1</span>
    <a href="" class="c2">我是超链接3</a>
    <style>
        /* .c1{
            color: aqua;
        }
        .c2{
            color: blue;
            font-size: 40px;
        } */
        .c1.c2{
            color: chartreuse;
        }
        p.c1{
            color: cornflowerblue;
        }
    </style>

    <a href="" class="c2">我是超链接1</a>
    <div>
        <a href="">我是超链接2</a>
    </div>
    <p>
        <a href="">我是超链接3</a>
    </p>
    <div>
        <p class="c1">
            <a href="">我是超链接4</a>
            <p class="c2">我是段落1</p>
        </p>
    </div>
    <style>
        div>a{
            color: aqua;
        }
        /* only 我是超链接2 matches: 我是超链接4 is inside the div but not a direct child of it */
        .c1>a{
            color: blueviolet;
        }
        .c1>.c2{
            color: blue;
        }
    </style>

    <a href="" class="c2">我是超链接1</a>
    <div>
        <a href="">我是超链接2</a>
    </div>
    <p>
        <a href="">我是超链接3</a>
    </p>
    <div>
        <p class="c1">
            <a href="">我是超链接4</a>
            <p class="c2">我是段落1</p>
        </p>
    </div>
    <style>
        /* descendant selector: joining selectors with a space matches any level
           of nesting, so both 我是超链接2 and 我是超链接4 are selected */
        div a{
            color: red;
        }
    </style>
</body>
</html>
Note: the package is installed as beautifulsoup4 (pip install beautifulsoup4), but imported as bs4.
from bs4 import BeautifulSoup
BeautifulSoup(page_source, 'lxml')
f = open('files/data.html', encoding='utf-8')
soup = BeautifulSoup(f.read(), 'lxml')
f.close()
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <p>我是段落1</p>
    <a class="c1">我是超链接1</a>
    <div>
        <span>我是span1</span>
        <p class="c1">我是段落2</p>
        <div>
            <a href="">我是超链接2</a>
            <p>
                <span class="c2">我是span2</span>
            </p>
        </div>
    </div>
    <div id="box1">
        <p title="abc">我是段落4</p>
        <a href="https://www.baidu.com">我是超链接3</a>
        <span>
            <p xy="报错!">我是段落5</p>
        </span>
    </div>
</body>
</html>
result = soup.select('p')
print(result)  # a list of all p tags
result = soup.select('.c1')
print(result)  # [<a class="c1">我是超链接1</a>, <p class="c1">我是段落2</p>]
result = soup.select_one('p')
print(result)  # <p>我是段落1</p>
result = soup.select('div p')
print(result)  # all p tags nested anywhere inside a div
box1 = soup.select_one('#box1')
print(box1)    # the whole <div id="box1">...</div>
result = box1.select('p')
print(result)  # [<p title="abc">我是段落4</p>, <p xy="报错!">我是段落5</p>]
p1 = soup.select_one('span>p')
a1 = box1.select_one('a')
print(a1)  # <a href="https://www.baidu.com">我是超链接3</a>
print(p1.text)  # 我是段落5
print(a1.text)  # 我是超链接3
print(a1.attrs['href'])  # https://www.baidu.com
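Attribute access can also be hedged: attrs['href'] raises a KeyError when the attribute is missing, while the tag's get() method returns None instead. A small sketch against the same data.html soup:
p4 = soup.select_one('p[title]')  # attribute selector: the p that has a title attribute
print(p4.attrs['title'])          # abc
print(p4.get('xy'))               # None: this p has no xy attribute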
Example: scraping Douban movie data with bs4
import requests
from bs4 import BeautifulSoup

# 1. Fetch the page
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250', headers=headers)
html = response.text
# 2. Parse the data
soup = BeautifulSoup(html, 'lxml')
# Get the div corresponding to each movie
div_li = soup.select('.grid_view>li>div')
for x in div_li:
    name = x.select_one('.title').text
    score = x.select_one('.rating_num').text
    comment = x.select('.star>span')[-1].text
    print(name, score, comment)
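Tying the two halves of the lesson together, the parsed fields can be written straight to a CSV instead of printed; a minimal sketch reusing div_li from above (the output path files/TOP250_bs4.csv is made up):
from csv import writer

with open('files/TOP250_bs4.csv', 'w', encoding='utf-8', newline='') as f:
    w = writer(f)
    w.writerow(['电影名称', '评分', '评价人数'])
    for x in div_li:
        w.writerow([
            x.select_one('.title').text,
            x.select_one('.rating_num').text,
            x.select('.star>span')[-1].text
        ])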
(A screenshot of part of the output followed here.)