[Entertainment Celebrity Knowledge Graph 1] Encyclopedia Spider

Table of Contents

1. Project Introduction

2. Web Crawler

3. Crawler Results

4. Project Source Code


1. Project Introduction

Using features of the encyclopedia pages, the spider crawls the following information for each actor:

  1. Actor name
  2. Actor summary
  3. Actor basic information

Opening any person's encyclopedia page shows where these three pieces of content appear on the page. For details, refer to the open-source code linked at the bottom.
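
For illustration, a single crawled record ends up with the following fields (the values are placeholders; the field names match what the spider code in Section 2 writes out):

record = {
    "title": "黄晓明",                                   # actor name (page title)
    "url": "https://baike.baidu.com/item/黄晓明/6597",   # source encyclopedia page
    "summary": "...",                                    # summary paragraph of the entry
    "basic-info": "...",                                 # basic info box (birthplace, height, ...)
}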

2. Web Crawler

Requests is an elegant and simple HTTP library for Python.

Beautiful Soup is a Python library for extracting data from HTML and XML files.

The spider uses requests to fetch a page's HTML, then parses the HTML with Beautiful Soup to extract the target data.
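
A minimal sketch of this two-step workflow (the URL and User-Agent below are only illustrative; the full downloader and parser used by the project follow further down):

import requests
from bs4 import BeautifulSoup

# Step 1: fetch the raw HTML of one encyclopedia page
r = requests.get(
    "https://baike.baidu.com/item/黄晓明/6597",
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)

# Step 2: parse the HTML and pull out one piece of target data
soup = BeautifulSoup(r.text, "html.parser")
h1 = soup.find("h1")
print(h1.get_text() if h1 else "no <h1> found")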

Dependencies (requirements.txt):

beautifulsoup4==4.12.2
requests==2.20.0

Install them with:

pip install -r requirements.txt

Run the spider:

python main.py

Spider main program (main.py):

import json
import time
import random
from typing import Dict, Set

from utils.downloader import HTMLDownloader
from utils.manager import URLManager
from utils.parser import HTMLParser


class Spider:
    """
    Spider main program
    """
    FILE_PATH = "data/person.jsonl"

    def __init__(self) -> None:
        self._manager = URLManager()
        self._parser = HTMLParser()
        self._downloader = HTMLDownloader()
        self._downloaded_urls = self._get_downloaded_urls()

    def execute(self, init_url: str, target: int=20000) -> None:
        self._execute(init_url, target)

    def _execute(self, init_url: str, target: int) -> None:
        """
        Spider scheduling loop
        """
        self._display("开始爬虫")
        counter = 1

        # Add the initial URL
        self._manager.add_url(init_url)
        # Keep looping while the URL manager still has URLs
        while self._manager.has_url():
            url = self._manager.get_url()
            self._display("第 {} 个:".format(counter), "*", 5)
            print(url)

            # Skip already-crawled URLs (the seed URL is always re-crawled so its links get queued)
            if url in self._downloaded_urls and counter > 1:
                self._display("已爬取", "*", 2)
                continue

            # Download the page
            page, url = self._downloader.download(url)

            # Parse the page
            try:
                urls, data = self._parser.parse(page, url)
            except AttributeError as e:
                print(e)
                continue

            # Queue newly found URLs in the URL manager
            if urls:
                self._manager.add_urls(urls)

            # Save the crawled result
            if data and url not in self._downloaded_urls:
                self._save(data)
            counter += 1
            if counter > target:
                break
            self._sleep()

    @staticmethod
    def _display(text: str, symbol: str="=", num: int=20) -> None:
        """
        Formatted console output
        """
        line = symbol * num \
            + " " + text + " " \
            + symbol * num
        print(line)

    @staticmethod
    def _get_downloaded_urls() -> Set[str]:
        """
        Load the URLs that have already been crawled
        """
        with open(Spider.FILE_PATH, "r") as f:
            data = [json.loads(i)["url"] for i in f.readlines() if i.strip()]
        return set(data)

    @staticmethod
    def _sleep() -> None:
        """
        Sleep for a random interval to mimic human browsing
        """
        time.sleep(random.random() * 5 + 1)

    @staticmethod
    def _save(data: Dict[str, str]) -> None:
        """
        Append one result record to the JSONL file
        """
        with open(Spider.FILE_PATH, "a") as f:
            line = json.dumps(data, ensure_ascii=False)
            f.write(line + "\n")


if __name__ == "__main__":
    spider = Spider()
    init_url = "https://baike.baidu.com/item/黄晓明/6597"
    spider.execute(init_url)
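
For a quick smoke test, the crawl target can be lowered so the run stops after a handful of pages (the value below is arbitrary):

spider = Spider()
spider.execute("https://baike.baidu.com/item/黄晓明/6597", target=10)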

URL manager:

from typing import Set


class URLManager:
    """
    URL manager
    """

    def __init__(self) -> None:
        # URLs waiting to be crawled
        self._capture_urls = set()
        # URLs that have already been crawled
        self._finished_urls = set()

    def has_url(self) -> bool:
        """
        Check whether any URLs are waiting to be crawled
        """
        return len(self._capture_urls) != 0
    
    def get_url(self) -> str:
        """
        Pop a URL from the waiting set and mark it as finished
        """
        url = self._capture_urls.pop()
        self._finished_urls.add(url)
        return url
    
    def add_url(self, url: str) -> None:
        """
        Add a URL to the waiting set if it has not been seen before
        """
        if url and url not in self._capture_urls and url not in self._finished_urls:
            self._capture_urls.add(url)

    def add_urls(self, urls: Set[str]) -> None:
        """
        Add a batch of URLs to the waiting set
        """
        for url in urls:
            self.add_url(url)
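
A quick illustration of the deduplication behaviour (the URL is only an example):

from utils.manager import URLManager

manager = URLManager()
manager.add_url("https://baike.baidu.com/item/黄晓明/6597")
manager.add_url("https://baike.baidu.com/item/黄晓明/6597")   # duplicate, ignored
print(manager.has_url())    # True: one URL is waiting

url = manager.get_url()     # pops the URL and marks it as finished
manager.add_url(url)        # already finished, so it is not re-added
print(manager.has_url())    # False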

HTML downloader:

from urllib import parse
from typing import Union, Tuple

import requests


class HTMLDownloader:
    """
    HTML downloader
    """
    def __init__(self) -> None:
        self._headers = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        }

    def download(self, url: str) -> Union[Tuple[str, str], None]:
        """
        Download a web page
        """
        if url:
            r = requests.get(
                url=url,
                headers=self._headers,
                timeout=10
            )
            page = r.text
            # Decode percent-encoded characters (e.g. Chinese names) in the final URL
            url = parse.unquote(r.url)
            return page, url
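
Used on its own, the downloader returns the page HTML together with the final, percent-decoded URL, which may differ from the request URL after redirects (a minimal sketch; the URL is only an example):

from utils.downloader import HTMLDownloader

downloader = HTMLDownloader()
result = downloader.download("https://baike.baidu.com/item/黄晓明/6597")
if result:
    page, final_url = result
    print(final_url)    # final URL with percent-encoding decoded
    print(len(page))    # size of the downloaded HTML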
        

HTML parser:

import re
from typing import Set, Dict, Tuple, Union
from urllib import parse

from bs4 import BeautifulSoup


class HTMLParser:
    """
    HTML parser
    """
    # File of common Chinese surnames
    DIR = "data/lastnames.txt"
    # Actor keywords (matched against the Chinese summary text)
    ACTOR_KEYWORDS = [
        "演员",
    ]
    # Person keywords (matched against the Chinese basic-info text; \xa0 padding is kept as it appears on the page)
    PERSON_KEYWORDS = [
        "出生地",
        "毕业院校",
        "出生日期",
        "国\xa0\xa0\xa0\xa0籍",
        "民\xa0\xa0\xa0\xa0族",
        "身\xa0\xa0\xa0\xa0高",
        "星\xa0\xa0\xa0\xa0座",
        "血\xa0\xa0\xa0\xa0型",
        "生\xa0\xa0\xa0\xa0肖",
        "性\xa0\xa0\xa0\xa0别",
    ]
    # Maximum length for a candidate person name
    NAME_LENGTH = 4

    def __init__(self) -> None:
        self._soup = None
        self._current_url = None
        self._lastnames = self._get_lastnames()

    def parse(self, page: str, url: str) -> Tuple[Set[str], Dict[str, str]]:
        """
        Parse a downloaded page into follow-up URLs and actor data
        """
        if page and url:
            self._soup = BeautifulSoup(page, "html.parser")
            self._current_url = url
            urls = self._get_page_urls()
            data = self._get_page_data()
            return urls, data

    def _get_page_urls(self) -> Set[str]:
        """
        Collect URLs on the page, keeping only those that look like person names
        """
        urls = self._soup.find_all(
            'a',
            href=re.compile(r'/item/(\%\w{2})+')
        )
        url_set = set()
        for url in urls:
            rel_path = parse.unquote(url["href"])
            if self._is_name(rel_path):
                path = parse.urljoin(self._current_url, rel_path)
                url_set.add(path)
        return url_set

    def _get_page_title(self) -> str:
        """
        Get the page title
        """
        title = self._soup.find(
            'dd',
            class_='lemmaWgt-lemmaTitle-title'
        ).find('h1').get_text()
        return title

    def _get_page_summary(self) -> str:
        """
        Get the page summary
        """
        summary = self._soup.find(
            'div',
            attrs={'label-module': 'lemmaSummary'},
            class_='lemma-summary'
        ).get_text()
        return summary

    def _get_page_info(self) -> str:
        """
        Get the basic information box
        """
        info = self._soup.find(
            "div",
            class_="basic-info"
        ).get_text()
        return info

    def _get_page_data(self) -> Union[Dict[str, str], None]:
        """
        Assemble the page data if the page describes an actor
        """
        info = self._get_page_info()
        summary = self._get_page_summary()
        if self._is_person(info) and self._is_actor(summary):
            data = dict()
            data["title"] = self._get_page_title()
            data["url"] = self._current_url
            data["summary"] = summary
            data["basic-info"] = info
            return data

    @staticmethod
    def _get_lastnames() -> Set[str]:
        """
        Load the set of Chinese surnames
        """
        with open(HTMLParser.DIR, "r", encoding="utf-8") as f:
            data = [i.strip() for i in f.readlines() if i.strip()]
        return set(data)

    def _is_name(self, rel_path: str) -> bool:
        """
        是否是名字 
        eg: /item/黄晓明/
        """
        name = rel_path.split('/')[2]
        top1, top2 = name[:1], name[:2]
        if len(name) <= HTMLParser.NAME_LENGTH and (top1 in self._lastnames or top2 in self._lastnames):
            return True
        return False

    @staticmethod
    def _is_person(info: str) -> bool:
        """
        Use keywords to decide whether the page describes a person
        """
        for key in HTMLParser.PERSON_KEYWORDS:
            if key in info:
                return True
        return False
    
    @staticmethod
    def _is_actor(summary: str) -> bool:
        """
        Use keywords to decide whether the page describes an actor
        """
        for key in HTMLParser.ACTOR_KEYWORDS:
            if key in summary:
                return True
        return False
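
The two keyword checks are static methods, so they can be sanity-checked without the surname file (the text snippets below are illustrative):

from utils.parser import HTMLParser

# A basic-info text mentioning a birthplace marks the page as a person
print(HTMLParser._is_person("出生地:山东省青岛市"))    # True

# The summary must mention the actor keyword for the record to be kept
print(HTMLParser._is_actor("中国内地影视男演员"))      # True
print(HTMLParser._is_actor("中国歌手"))                # False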
    

3. Crawler Results

{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}
{"title": "xxx", "url": "xxx", "summary": "xxx"}

4. Project Source Code

https://gitee.com/hl0929/baike-spider
