Table of Contents
1. Project Introduction
2. Web Crawler
3. Crawl Results
4. Project Source Code
1. Project Introduction
Using the page's structural features, the crawler scrapes the following Baike (Baidu encyclopedia) information for each actor:
- Actor name
- Actor summary
- Actor basic information
Opening any person's Baike page shows where these three pieces of content are located. For details, see the open-source code linked at the bottom.
2. Web Crawler
Requests is an elegant and simple HTTP library for Python.
Beautiful Soup is a Python library for extracting data from HTML or XML files.
We use requests to fetch a page's HTML, then use Beautiful Soup to parse the HTML and extract the target data.
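A minimal sketch of this two-step pattern (the URL below is the project's seed page; the simplified User-Agent header is only illustrative):
import requests
from bs4 import BeautifulSoup

# Fetch the raw HTML of one Baike page
r = requests.get(
    "https://baike.baidu.com/item/黄晓明/6597",
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)
# Parse the HTML and pull out the first <h1> as a quick smoke test
soup = BeautifulSoup(r.text, "html.parser")
print(soup.find("h1").get_text())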
Install the dependencies (requirements.txt):
beautifulsoup4==4.12.2
requests==2.20.0
pip install -r requirements.txt
Run the crawler:
python main.py
Crawler main program (main.py):
import json
import time
import random
from typing import Dict, Set

from utils.downloader import HTMLDownloader
from utils.manager import URLManager
from utils.parser import HTMLParser


class Spider:
    """
    Crawler main program.
    """

    FILE_PATH = "data/person.jsonl"

    def __init__(self) -> None:
        self._manager = URLManager()
        self._parser = HTMLParser()
        self._downloader = HTMLDownloader()
        self._downloaded_urls = self._get_downloaded_urls()

    def execute(self, init_url: str, target: int = 20000) -> None:
        self._execute(init_url, target)

    def _execute(self, init_url: str, target: int) -> None:
        """
        Crawl scheduling.
        """
        self._display("Starting crawl")
        counter = 1
        # Seed the manager with the initial URL
        self._manager.add_url(init_url)
        # Keep running while the URL manager still has pending URLs
        while self._manager.has_url():
            url = self._manager.get_url()
            self._display("No. {}:".format(counter), "*", 5)
            print(url)
            # Skip URLs saved in a previous run; the first iteration is let
            # through so link discovery can bootstrap the queue
            if url in self._downloaded_urls and counter > 1:
                self._display("Already crawled", "*", 2)
                continue
            # Download the page
            page, url = self._downloader.download(url)
            # Parse the page; a missing DOM node surfaces as AttributeError
            try:
                urls, data = self._parser.parse(page, url)
            except AttributeError as e:
                print(e)
                continue
            # Feed newly discovered URLs back into the manager
            if urls:
                self._manager.add_urls(urls)
            # Persist the result if it is new
            if data and url not in self._downloaded_urls:
                self._save(data)
            counter += 1
            if counter > target:
                break
            self._sleep()

    @staticmethod
    def _display(text: str, symbol: str = "=", num: int = 20) -> None:
        """
        Formatted console output.
        """
        line = symbol * num + " " + text + " " + symbol * num
        print(line)

    @staticmethod
    def _get_downloaded_urls() -> Set[str]:
        """
        Load the URLs that were already saved in a previous run.
        """
        try:
            with open(Spider.FILE_PATH, "r", encoding="utf-8") as f:
                data = [json.loads(i)["url"] for i in f if i.strip()]
        except FileNotFoundError:
            # First run: no result file yet
            return set()
        return set(data)

    @staticmethod
    def _sleep() -> None:
        """
        Sleep 1-6 seconds at random to mimic a human clicking through pages.
        """
        time.sleep(random.random() * 5 + 1)

    @staticmethod
    def _save(data: Dict[str, str]) -> None:
        """
        Append one result as a JSON line.
        """
        with open(Spider.FILE_PATH, "a", encoding="utf-8") as f:
            line = json.dumps(data, ensure_ascii=False)
            f.write(line + "\n")


if __name__ == "__main__":
    spider = Spider()
    init_url = "https://baike.baidu.com/item/黄晓明/6597"
    spider.execute(init_url)
URL manager (utils/manager.py):
from typing import Set


class URLManager:
    """
    URL manager.
    """

    def __init__(self) -> None:
        # URLs waiting to be crawled
        self._capture_urls = set()
        # URLs already crawled
        self._finished_urls = set()

    def has_url(self) -> bool:
        """
        Return whether the pending set still has URLs.
        """
        return len(self._capture_urls) != 0

    def get_url(self) -> str:
        """
        Pop a URL from the pending set and mark it as finished.
        """
        url = self._capture_urls.pop()
        self._finished_urls.add(url)
        return url

    def add_url(self, url: str) -> None:
        """
        Add a URL to the pending set unless it is already known.
        """
        if url and url not in self._capture_urls and url not in self._finished_urls:
            self._capture_urls.add(url)

    def add_urls(self, urls: Set[str]) -> None:
        """
        Add a batch of URLs to the pending set.
        """
        for url in urls:
            self.add_url(url)
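A quick, hypothetical check of the manager's dedup behavior (the URLs are placeholders):
manager = URLManager()
manager.add_url("https://baike.baidu.com/item/A")
manager.add_url("https://baike.baidu.com/item/A")   # duplicate: ignored
manager.add_urls({"https://baike.baidu.com/item/B"})
while manager.has_url():
    print(manager.get_url())                         # prints A and B once each
# A URL that was popped is remembered and never re-queued
manager.add_url("https://baike.baidu.com/item/A")
print(manager.has_url())                             # False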
HTML downloader (utils/downloader.py):
from urllib import parse
from typing import Optional, Tuple

import requests


class HTMLDownloader:
    """
    HTML downloader.
    """

    def __init__(self) -> None:
        self._headers = {
            "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/113.0.0.0 Safari/537.36"
        }

    def download(self, url: str) -> Optional[Tuple[str, str]]:
        """
        Download a page; returns (html, resolved_url), or None for an empty URL.
        """
        if not url:
            return None
        # A timeout keeps one stalled request from hanging the whole crawl
        r = requests.get(url=url, headers=self._headers, timeout=10)
        page = r.text
        # Baike redirects percent-encoded URLs; unquote the final URL for readability
        url = parse.unquote(r.url)
        return page, url
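A short usage sketch (the seed URL from main.py; network access is assumed):
downloader = HTMLDownloader()
result = downloader.download("https://baike.baidu.com/item/黄晓明/6597")
if result:
    page, resolved_url = result
    print(resolved_url)   # final, percent-decoded URL after any redirect
    print(len(page))      # size of the fetched HTML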
HTML parser (utils/parser.py):
import re
from typing import Set, Dict, Optional, Tuple
from urllib import parse

from bs4 import BeautifulSoup


class HTMLParser:
    """
    HTML parser.
    """

    # File of common Chinese surnames (Baijiaxing)
    DIR = "data/lastnames.txt"
    # Keywords that mark an actor
    ACTOR_KEYWORDS = [
        "演员",
    ]
    # Keywords that mark a person page (the \xa0 runs match Baike's
    # padded two-character field labels)
    PERSON_KEYWORDS = [
        "出生地",
        "毕业院校",
        "出生日期",
        "国\xa0\xa0\xa0\xa0籍",
        "民\xa0\xa0\xa0\xa0族",
        "身\xa0\xa0\xa0\xa0高",
        "星\xa0\xa0\xa0\xa0座",
        "血\xa0\xa0\xa0\xa0型",
        "生\xa0\xa0\xa0\xa0肖",
        "性\xa0\xa0\xa0\xa0别",
    ]
    # Maximum length of a candidate name
    NAME_LENGTH = 4

    def __init__(self) -> None:
        self._soup = None
        self._current_url = None
        self._lastnames = self._get_lastnames()

    def parse(self, page: str, url: str) -> Tuple[Set[str], Optional[Dict[str, str]]]:
        """
        Parse a page; returns (discovered URLs, extracted data or None).
        """
        if not (page and url):
            return set(), None
        self._soup = BeautifulSoup(page, "html.parser")
        self._current_url = url
        urls = self._get_page_urls()
        data = self._get_page_data()
        return urls, data

    def _get_page_urls(self) -> Set[str]:
        """
        Collect the page's /item/ links, keeping only those that look like names.
        """
        urls = self._soup.find_all(
            'a',
            href=re.compile(r'/item/(%\w{2})+')
        )
        url_set = set()
        for url in urls:
            rel_path = parse.unquote(url["href"])
            if self._is_name(rel_path):
                path = parse.urljoin(self._current_url, rel_path)
                url_set.add(path)
        return url_set

    def _get_page_title(self) -> str:
        """
        Get the page title.
        """
        title = self._soup.find(
            'dd',
            class_='lemmaWgt-lemmaTitle-title'
        ).find('h1').get_text()
        return title

    def _get_page_summary(self) -> str:
        """
        Get the page summary.
        """
        summary = self._soup.find(
            'div',
            attrs={'label-module': 'lemmaSummary'},
            class_='lemma-summary'
        ).get_text()
        return summary

    def _get_page_info(self) -> str:
        """
        Get the page's basic-info box.
        """
        info = self._soup.find(
            "div",
            class_="basic-info"
        ).get_text()
        return info

    def _get_page_data(self) -> Optional[Dict[str, str]]:
        """
        Extract the page data when the page describes an actor.
        """
        info = self._get_page_info()
        summary = self._get_page_summary()
        if self._is_person(info) and self._is_actor(summary):
            data = dict()
            data["title"] = self._get_page_title()
            data["url"] = self._current_url
            data["summary"] = summary
            data["basic-info"] = info
            return data
        return None

    @staticmethod
    def _get_lastnames() -> Set[str]:
        """
        Load the surname set.
        """
        with open(HTMLParser.DIR, "r", encoding="utf-8") as f:
            data = [i.strip() for i in f if i.strip()]
        return set(data)

    def _is_name(self, rel_path: str) -> bool:
        """
        Check whether a relative path looks like a person's name,
        e.g. /item/黄晓明/
        """
        name = rel_path.split('/')[2]
        top1, top2 = name[:1], name[:2]
        # Short enough, and starting with a one- or two-character surname
        if len(name) <= HTMLParser.NAME_LENGTH and (top1 in self._lastnames or top2 in self._lastnames):
            return True
        return False

    @staticmethod
    def _is_person(info: str) -> bool:
        """
        Use keywords to decide whether the page describes a person.
        """
        for key in HTMLParser.PERSON_KEYWORDS:
            if key in info:
                return True
        return False

    @staticmethod
    def _is_actor(summary: str) -> bool:
        """
        Use keywords to decide whether the person is an actor.
        """
        for key in HTMLParser.ACTOR_KEYWORDS:
            if key in summary:
                return True
        return False
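To see the parser end to end, here is a hypothetical snippet with a stripped-down page that mimics only the DOM hooks the parser queries (it assumes data/lastnames.txt exists and contains 黄 and 赵):
html = """
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>黄晓明</h1></dd>
  <div class="lemma-summary" label-module="lemmaSummary">黄晓明,中国内地男演员。</div>
  <div class="basic-info">出生地:山东省青岛市</div>
  <a href="/item/%E8%B5%B5%E4%B8%BD%E9%A2%96">赵丽颖</a>
</body></html>
"""
parser = HTMLParser()
urls, data = parser.parse(html, "https://baike.baidu.com/item/黄晓明/6597")
print(urls)           # {'https://baike.baidu.com/item/赵丽颖'}
print(data["title"])  # 黄晓明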
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
4. Project Source Code
https://gitee.com/hl0929/baike-spider