Python Weibo Crawler: Keyword Search

First, install the required libraries:

pip install pyquery
pip install requests
pip install pymongo
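
The script stores its results in MongoDB, so a local MongoDB server needs to be running. Below is a minimal sketch (assuming MongoDB listens on the default localhost:27017) to verify the connection before crawling:

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

# assumes a local MongoDB server on the default port 27017
client = MongoClient(serverSelectionTimeoutMS=2000)
try:
    client.admin.command('ping')  # lightweight command that checks the server is reachable
    print('MongoDB is reachable')
except ConnectionFailure as e:
    print('Cannot reach MongoDB:', e)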

The full code is as follows:

import requests
from pyquery import PyQuery as pq
import time
from pymongo import MongoClient
from urllib.parse import quote


headers = {
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


m = input('Search keyword: ')  # the keyword to search for

n = int(input('Number of pages to fetch: '))  # how many result pages to crawl


def get_page(page): 

    url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D' + quote(m) + '&page_type=searchall&page=' + str(page)  # build the search API URL for the given keyword and page
    
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(page)
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)

def parse_page(json):
    if json:
        items = json.get('data', {}).get('cards', [])  # guard against a response with no "data" field
        for i in items:
            groups = i.get('card_group')
            if groups is None:
                continue
            for item in groups:
                item = item.get('mblog')
                if item is None:
                    continue

                weibo = {}
                weibo['id'] = item.get('id')
                weibo['text'] = pq(item.get('text')).text()  # strip HTML tags from the post body with pyquery
                weibo['name'] = item.get('user').get('screen_name')
                if item.get('longText') is not None:  # Weibo splits posts into text and longText; long posts are truncated in "text", so also grab the full long-text content
                    weibo['longText'] = item.get('longText').get('longTextContent')
                else:
                    weibo['longText'] = None
                print(weibo['name'])
                print(weibo['text'])
                if weibo['longText'] is not None:  # only print the long text when it exists
                    print('longText (full text) ##################')
                    print(weibo['longText'])
                weibo['attitudes'] = item.get('attitudes_count')
                weibo['comments'] = item.get('comments_count')
                weibo['reposts'] = item.get('reposts_count')
                weibo['time'] = item.get('created_at')
                
                yield weibo
if __name__ == '__main__':
    client = MongoClient()         # connect to the local MongoDB instance
    db = client['weibo_km']        # database
    collection = db['weibo_km']    # collection


    def save_to_mongo(result):  # save a single post to the database
        collection.insert_one(result)
        print('Saved to Mongo')

    for page in range(1, n + 1):  # loop over the requested pages
        time.sleep(1)             # sleep between requests to avoid being rate-limited or banned
        json = get_page(page)
        results = parse_page(json)

        for result in results:
            print(result['time'])  # print the post's publication time
            save_to_mongo(result)  # save the post to the database
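
As a side note, instead of concatenating and quoting the query string by hand, the search URL can also be assembled by requests itself. This is only a sketch of an alternative get_page, under the assumption that the endpoint accepts the identically encoded query (requests percent-encodes the '=' and '&' inside the containerid value):

def get_page_alt(page):
    params = {
        'containerid': '100103type=1&q=' + m,
        'page_type': 'searchall',
        'page': page,
    }
    response = requests.get('https://m.weibo.cn/api/container/getIndex',
                            params=params, headers=headers)
    if response.status_code == 200:
        return response.json()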

The crawled data can then be processed further, for example:
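
Here is a minimal sketch (assuming the same database and collection names as above) that reads the saved posts back and summarizes them:

from pymongo import MongoClient

client = MongoClient()
collection = client['weibo_km']['weibo_km']

# total number of posts that were saved
print('posts saved:', collection.count_documents({}))

# the ten posts with the most likes ("attitudes")
for doc in collection.find().sort('attitudes', -1).limit(10):
    print(doc['attitudes'], doc['name'], doc['text'][:40])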
