开发工具:PyCharm,Python 3.8
用到的模块
fake_useragent 模块
requests 模块
lxml 模块(使用 XPath 解析 HTML)
爬虫基本原理分析
书写代码爬取网络资源
```python
# @function:爬取电影top250
# @Description:一只萤火虫
# 清华大学镜像网站 https://pypi.tuna.tsinghua.edu.cn/simple/
# 安装 requests库文件
# 安装 fake-useragent 库文件
import time
import requests
from fake_useragent import UserAgent
from lxml import html
def get_html(url, timeout=10):
    """Download *url* with a randomized User-Agent header and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The response body as decoded text (``response.text``).

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection errors or when *timeout* expires.
    """
    user_agent = UserAgent()  # browser User-Agent generator
    headers = {"user-agent": user_agent.random}  # pick a random browser identity
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception("请检查传入的url", url)
    return response.text
if __name__ == "__main__":
    # Paged listing of the Top 250; `start` is the offset of the first movie
    # on the requested page.
    url = "https://movie.douban.com/top250?start={}"
    etree = html.etree  # lxml parser entry point
    page_count = 10
    page_size = 25
    for i in range(page_count):
        html_str = get_html(url.format(i * page_size))
        # Bound to `page_tree` rather than `html` so the imported lxml `html`
        # module is not shadowed by the parsed document.
        page_tree = etree.HTML(html_str)
        # XPaths were copied from the browser inspector (right click ->
        # Inspect -> Copy -> Copy XPath). Comparing two movies shows that each
        # <li> under the <ol> holds one movie:
        #   title: //*[@id="content"]/div/div[1]/ol/li[N]/div/div[2]/div[1]/a/span[1]
        #   score: //*[@id="content"]/div/div[1]/ol/li[N]/div/div[2]/div[2]/div/span[2]
        lis = page_tree.xpath('//*[@id="content"]/div/div[1]/ol/li')
        # Rank continues across pages: the first page starts at 1, the second
        # at 26, and so on.
        for j, li in enumerate(lis, start=page_size * i + 1):
            title_spans = li.xpath('./div/div[2]/div[1]/a/span[1]')
            # `.text` is None for an element without text; fall back to ""
            # so the concatenation cannot raise TypeError. The leading space
            # is kept so the printed layout stays unchanged.
            movie_title = " " + "".join(span.text or "" for span in title_spans)
            score_spans = li.xpath('./div/div[2]/div[2]/div/span[2]')
            movie_score = "".join(span.text or "" for span in score_spans)
            print("第{:2}个电影:{:30}/豆瓣{}分".format(j, movie_title, movie_score))
        # Pause between page requests to stay polite to the server; there is
        # nothing left to wait for after the final page.
        if i < page_count - 1:
            time.sleep(5)
```