import json
import re

import requests

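# Minimal 36kr.com spider: pulls article title, summary and cover out of the
# JSON embedded in the homepage, then pages through the search-column AJAX
# API, writing one JSON object per line to kr36.json.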
class Kr36(object):

    def __init__(self):
        # Start from the homepage
        self.url = 'https://36kr.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        # AJAX endpoint used to request the next page of articles
        self.ajax_url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page={}'
        self.file = open('kr36.json', 'w')
        self.page = 1

    def get_data(self, url):
        # Request a page and return the decoded response body
        resp = requests.get(url, headers=self.headers)
        print(resp)  # quick status check, e.g. <Response [200]>
        return resp.content.decode()

    # Parse the homepage: the article data is embedded in the page as JSON
    def parse_data(self, data):
        # Use a regex to pull the embedded data out of the markup.
        # NOTE: the pattern was missing in the original; this one assumes the
        # page keeps its state in a <script> tag as "var props=...".
        result = re.findall(r'var props=(.*?)</script>', data)[0]
        # The captured string is not pure JSON, so cut off the trailing fields
        result = result.split(',locationnal={"ur')[0]
        # For debugging, the raw string can be written to a file and inspected:
        # with open('kr36.json', 'w') as f:
        #     f.write(result)
        json_dict = json.loads(result)
        news_list = json_dict['feedPostsLatest|post']
        data_list = []
        for news in news_list:
            temp = {}
            temp['title'] = news['title']
            temp['summary'] = news['summary']
            temp['cover'] = news['cover']
            data_list.append(temp)
        return data_list

    # Write each item to the output file as one JSON object per line
    def save_data(self, data_list):
        for data in data_list:
            line = json.dumps(data, ensure_ascii=False) + ',\n'
            self.file.write(line)

    # Parse the JSON returned by the AJAX next-page endpoint
    def parse_ajax_data(self, data):
        data_dict = json.loads(data)
        news_list = data_dict['data']['items']
        data_list = []
        for news in news_list:
            temp = {}
            temp['title'] = news['title']
            temp['summary'] = news['summary']
            temp['cover'] = news['cover']
            data_list.append(temp)
        return data_list

    # Close the output file when the spider object is destroyed
    def __del__(self):
        self.file.close()

    def run(self):
        # First page: parse the data embedded in the homepage HTML
        url = self.url
        data = self.get_data(url)
        data_list = self.parse_data(data)
        self.save_data(data_list)
        # Keep requesting the next page via AJAX until an empty page comes back
        while True:
            url = self.ajax_url.format(self.page)
            data = self.get_data(url)
            ajax_data_list = self.parse_ajax_data(data)
            self.save_data(ajax_data_list)
            if not ajax_data_list:
                break
            self.page += 1


if __name__ == '__main__':
    kr36 = Kr36()
    kr36.run()