# Python crawler: scraping article data from 36kr.com

import json

import requests
import re

class Kr36(object):
    """Scrape article title/summary/cover from 36kr.com into ``kr36.json``.

    The landing page carries the first batch of articles embedded in its
    HTML; subsequent batches are fetched through the site's search-column
    ajax API, one page at a time, until an empty page is returned.
    """

    def __init__(self):
        # Landing page: first batch of articles is embedded in the HTML.
        self.url = 'https://36kr.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        # Ajax endpoint for the following pages; {} is filled with the page number.
        self.ajax_url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page={}'
        # Explicit utf-8 so Chinese titles are written correctly regardless of
        # the platform's default encoding.
        self.file = open('kr36.json', 'w', encoding='utf-8')
        self.page = 1

    def get_data(self, url, timeout=10):
        """Fetch *url* and return the decoded response body as text.

        :param url: absolute URL to request.
        :param timeout: seconds before the request is aborted — requests has
            no default timeout, so without this a dead server hangs forever.
        :raises requests.HTTPError: on a non-2xx response, instead of
            silently parsing an error page.
        """
        resp = requests.get(url, headers=self.headers, timeout=timeout)
        resp.raise_for_status()
        return resp.content.decode()

    @staticmethod
    def _extract_items(news_list):
        """Reduce raw article dicts to the three fields we persist."""
        return [
            {'title': news['title'], 'summary': news['summary'], 'cover': news['cover']}
            for news in news_list
        ]

    def parse_data(self, data):
        """Parse the landing-page HTML and return a list of article dicts.

        The page embeds its state as ``var props=...`` inside a script tag.
        NOTE(review): the original pattern here was the empty string, which
        always matched ``''`` and made ``json.loads`` fail; this pattern is
        reconstructed from the ``,locationnal`` split below — confirm it
        against the site's current markup.
        """
        result = re.findall(r'<script>var props=(.*?)</script>', data)[0]
        # The captured blob is not pure JSON: everything from
        # ",locationnal=..." onward must be cut off before json.loads.
        result = result.split(',locationnal={"ur')[0]
        json_dict = json.loads(result)
        news_list = json_dict['feedPostsLatest|post']
        return self._extract_items(news_list)

    def save_data(self, data_list):
        """Append each record to the output file as one JSON line.

        The trailing comma matches the original output format (the file is a
        comma-separated sequence of objects, not a single JSON document).
        """
        for data in data_list:
            self.file.write(json.dumps(data, ensure_ascii=False) + ',\n')

    def parse_ajax_data(self, data):
        """Parse one ajax page (a JSON string) and return article dicts."""
        data_dict = json.loads(data)
        return self._extract_items(data_dict['data']['items'])

    def __del__(self):
        # Best-effort close: __del__ may run after a failed __init__ (no
        # self.file yet) or during interpreter shutdown, so guard both cases.
        file = getattr(self, 'file', None)
        if file is not None and not file.closed:
            file.close()

    def run(self):
        """Scrape the landing page, then follow ajax pages until one is empty."""
        data = self.get_data(self.url)
        self.save_data(self.parse_data(data))
        while True:
            url = self.ajax_url.format(self.page)
            ajax_data_list = self.parse_ajax_data(self.get_data(url))
            # An empty page marks the end of the feed; saving [] was a no-op
            # in the original, so checking first is behaviorally equivalent.
            if not ajax_data_list:
                break
            self.save_data(ajax_data_list)
            self.page += 1

if __name__ == '__main__':
    # Script entry point: build the crawler and run the full scrape.
    spider = Kr36()
    spider.run()

# Tags: python, webspider