Python Crawler for Weibo (WB) Users

The utils module imported below is a small helper of my own: its make_headers() just builds the request headers (a dict with a User-Agent). It is not included here, so write your own; a minimal sketch follows.
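For reference, here is a minimal sketch of such a helper, assuming all the crawler needs is a dict carrying a browser-like User-Agent (the function name and return shape match the make_headers import used below; the UA string itself is only a placeholder, substitute your own):

# utils.py -- minimal stand-in for the original helper module (a sketch)
def make_headers():
    # return request headers containing a browser-like User-Agent;
    # the value below is only an example
    return {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

The crawler itself: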

import json
import os

import requests
import urllib.request
from utils import make_headers  # custom helper; see the make_headers sketch above


class WeiboUserCrawler:
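    """Crawl a Weibo user through the m.weibo.cn mobile API: print the
    profile info and save post text and images under path/user_id/."""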
    def __init__(self, user_id, path="weibo", proxy_addr="122.241.72.191:808"):
        self.user_id = user_id
        self.path = path
        self.proxy_addr = proxy_addr
        self.url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + self.user_id

        self.container_id = self.get_container_id(self.url)
        self.page = 1  # current page of the user's Weibo feed
        self.pic_num = 0  # running counter used to name downloaded images
        self.flag = True  # set to False once the API returns no more cards
        self.create_file()

    def create_file(self):
        # make sure the base directory and the per-user subdirectory exist
        os.makedirs(os.path.join(self.path, self.user_id), exist_ok=True)

    def use_proxy(self, url):
        # fetch the URL with urllib; note that the proxy handler below only
        # covers plain-HTTP requests, so HTTPS URLs are fetched directly
        req = urllib.request.Request(url)
        req.add_header("User-Agent", make_headers()["User-Agent"])
        proxy = urllib.request.ProxyHandler({'http': self.proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        return data  # the decoded response body as a string

    def get_container_id(self, url):
        data = self.use_proxy(url)
        # parse the JSON response and pull out its 'data' section
        content = json.loads(data).get('data')
        # the feed's container id sits on the 'weibo' tab
        for tab in content.get('tabsInfo').get('tabs'):
            if tab.get('tab_type') == 'weibo':
                return tab.get('containerid')

    def get_user_info(self):
        data = self.use_proxy(self.url)
        # parse the JSON response and pull out its 'data' section
        content = json.loads(data).get('data')
        # extract the profile fields from userInfo
        user_info = content.get('userInfo')
        profile_image_url = user_info.get('profile_image_url')
        description = user_info.get('description')
        profile_url = user_info.get('profile_url')
        verified = user_info.get('verified')
        guanzhu = user_info.get('follow_count')
        name = user_info.get('screen_name')
        fensi = user_info.get('followers_count')
        gender = user_info.get('gender')
        urank = user_info.get('urank')
        print(f"微博昵称:{name}\n微博主页地址:{profile_url}\n微博头像地址:{profile_image_url}\n"
              f"是否认证:{verified}\n微博说明:{description}\n关注人数:{guanzhu}\n"
              f"粉丝数:{fensi}\n性别:{gender}\n微博等级:{urank}\n")

    def save_img(self, mblog):
        # download every picture attached to this post at its 'large' size
        if mblog.get('pics'):
            for pic in mblog.get('pics'):
                self.pic_num += 1
                img_url = pic['large']['url']
                print(img_url)
                img_data = requests.get(img_url)
                # reuse the last four characters of the URL (normally ".jpg") as the extension
                path = f"{self.path}/{self.user_id}/{self.user_id}_{self.pic_num}{img_url[-4:]}"
                with open(path, "wb") as f:
                    f.write(img_data.content)

    def save_data(self, j, mblog, cards):
        path = f"{self.path}/{self.user_id}/{self.user_id}.txt"
        attitudes_count = mblog.get('attitudes_count')
        comments_count = mblog.get('comments_count')
        created_at = mblog.get('created_at')
        reposts_count = mblog.get('reposts_count')
        scheme = cards[j].get('scheme')
        text = mblog.get('text')
        with open(path, 'a', encoding='utf-8') as f:
            # keep the numbering 1-based to match the progress output in process_data
            f.write(f"----第{self.page}页,第{j + 1}条微博----" + "\n")
            f.write(f"微博地址:{str(scheme)}\n发布时间:{str(created_at)}\n微博内容:{text}\n"
                    f"点赞数:{attitudes_count}\n评论数:{comments_count}\n转发数:{reposts_count}\n")

    def process_data(self, data):
        content = json.loads(data).get('data')
        cards = content.get('cards')
        if cards:
            for j, card in enumerate(cards):
                print(f"-----正在爬取第{self.page}页,第{j + 1}条微博------")
                # card_type 9 is an ordinary post card carrying an 'mblog' entry
                if card.get('card_type') == 9:
                    mblog = card.get('mblog')
                    self.save_img(mblog)
                    self.save_data(j, mblog, cards)
            self.page += 1
        else:
            # no cards returned: the last page has been reached
            self.flag = False

    def run(self):
        while True:
            weibo_url = f"{self.url}&containerid={self.container_id}&page={self.page}"
            try:
                data = self.use_proxy(weibo_url)
                self.process_data(data)
                if not self.flag:
                    break
            except Exception as err:
                print(err)
                break  # stop rather than retrying the same page forever


if __name__ == '__main__':
    # example uid: 1669879400 (热巴)
    uid = "1669879400"
    wb = WeiboUserCrawler(uid)
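    wb.get_user_info()  # optionally print the profile summary before crawling the posts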
    wb.run()
