# 知乎2 (Zhihu 2)

import requests,re,json,os,random,time
from bs4 import BeautifulSoup
def header():
    """Return a randomly chosen HTTP header dict for one request.

    Every candidate contains a single ``User-Agent`` entry (mobile browser
    or crawler strings).  The callers in this file pass the result as the
    ``headers=`` argument of each request, so successive requests may
    present different user agents.

    Returns:
        dict: A one-key mapping of the form ``{"User-Agent": "<string>"}``.
    """
    headers = [
        {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS) (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)"},
        {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"},
        {"User-Agent": "DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)"},
        {"User-Agent": "SAMSUNG-SGH-I617/UCHJ1 Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 7.11)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 8.12; MSIEMobile 6.0) 320x240; VZW; UTStar-XV6175.1; Windows Mobile 6.5 Standard;"},
        {"User-Agent": "Opera/9.80 (Android 2.3.3; Linux; Opera Mobi/ADR-1202011015; U; en) Presto/2.9.201 Version/11.50"},
        {"User-Agent": "Opera/9.80 (BREW; Opera Mini/5.0/27.2370; U; en) Presto/2.8.119 240X320 Samsung SCH-U380"},
    ]
    return random.choice(headers)
# Module-wide HTTP session; get_column() issues all of its GET requests
# through this single object.
s=requests.Session()
# Characters removed from an article's summary line before it is used as a
# file name: / \ * > < ? : " |
_FILENAME_UNSAFE = re.compile(r'[/\\*><?:"|]')


def get_column(column, timeout=30):
    """Download one column's follower list and articles into d://zhihu/<id>/.

    Args:
        column: Mapping describing the column.  ``column['id']`` is used in
            the API URLs and as the output directory name, and
            ``column['followers']`` (an integer) determines how many
            1000-entry follower pages are requested.
        timeout: Seconds passed to every HTTP request as ``timeout=``, so a
            stalled connection cannot block the crawl forever.

    Side effects:
        Appends ``slug,name,bio`` lines to ``followers_info.txt`` and writes
        one ``.txt`` file per article inside ``d://zhihu/<id>/``.  The
        directory itself is not created here.

    Raises:
        Errors from fetching or decoding the article list propagate to the
        caller.  Failures of individual follower pages or individual
        article files are reported with ``print`` and skipped.
    """
    _save_followers(column, timeout)
    _save_articles(column, timeout)


def _save_followers(column, timeout):
    """Fetch every follower page of *column* and append it to followers_info.txt."""
    page_count = column['followers'] // 1000 + 1
    first_page = 'https://zhuanlan.zhihu.com/api/columns/%s/followers?limit=1000' % column['id']
    # The first page carries no offset; page i starts at offset i * 1000.
    page_urls = [first_page] + [
        '%s&offset=%d000' % (first_page, i) for i in range(1, page_count)
    ]
    followers_path = 'd://zhihu//%s//followers_info.txt' % column['id']

    for page_url in page_urls:
        # Fixed pause before every follower request (presumably to stay under
        # the server's rate limit).
        time.sleep(6)
        try:
            response = s.get(page_url, headers=header(), timeout=timeout)
            followers = json.loads(response.content.decode('utf-8'))
            if followers:
                # Open once per page instead of once per follower.  The
                # platform default encoding with errors='replace' is kept so
                # appended lines match whatever an existing file contains.
                with open(followers_path, 'a', errors='replace') as f:
                    for people in followers:
                        f.write(str(people['slug']) + ',' + str(people['name']) + ',' + str(people['bio']) + '\n')
        except (requests.RequestException, OSError, ValueError, KeyError, TypeError) as exc:
            # Best effort per page: network failures, undecodable or
            # unexpected payloads and file errors skip this page only.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('skipped followers page %s: %r' % (page_url, exc))
            continue


def _save_articles(column, timeout):
    """Fetch up to 100 posts of *column* and write each one to its own text file."""
    column_url = 'https://zhuanlan.zhihu.com/api/columns/%s/posts?limit=100' % column['id']
    response = s.get(column_url, headers=header(), timeout=timeout)
    articles = json.loads(response.content.decode('utf-8'))

    for article in articles:
        print(article['title'])
        print(article['slug'])
        print(article['publishedTime'])
        print(article['likesCount'])
        print(article['commentsCount'])
        print(article['author']['name'])

        # One-line summary that doubles as the file name and as the first
        # line of the file, hence the removal of path-hostile characters.
        a = (article['title'] + '赞' + str(article['likesCount'])
             + '作者' + article['author']['name']
             + '时间' + str(article['publishedTime'])
             + '评论' + str(article['commentsCount'])
             + '链接' + str(article['slug']))
        a = _FILENAME_UNSAFE.sub('', a)
        art_url = 'https://zhuanlan.zhihu.com/p/%s?refer=%s' % (article['slug'], column['id'])
        # The API returns the article body as HTML; keep only its text.
        bs1 = BeautifulSoup(article['content'], 'lxml').text

        try:
            with open('d://zhihu//%s//%s.txt' % (column['id'], a), 'w', errors='replace') as f:
                f.write(a + '\n' + art_url + '\n' + bs1)
        except OSError as exc:
            # A title that still yields an invalid or over-long file name
            # must not abort the remaining articles of this column.
            print('could not save article %s: %r' % (article['slug'], exc))


# Every entry directly under d://zhihu is treated as one column directory
# that contains a column_info.txt describing the column.
columns_dir = os.listdir(r'd://zhihu')


for column_dir in columns_dir:
    try:
        with open('d://zhihu//%s//column_info.txt' % column_dir, 'r') as colu:
            # SECURITY NOTE(review): eval() executes whatever the file
            # contains.  This is only acceptable while column_info.txt is
            # produced locally by trusted code; if the file holds a plain
            # dict literal, ast.literal_eval would be a safer parser.
            column_dict = eval(colu.read())
        get_column(column_dict)
    except Exception as exc:
        # Best effort per column: an unreadable info file or a failed crawl
        # skips this column only.  Exception (not a bare except) is caught so
        # that Ctrl-C / SystemExit still stop the whole run.
        print('skipped column %s: %r' % (column_dir, exc))
        continue

# 你可能感兴趣的:(知乎2)  -- blog footer: "You may also be interested in: (Zhihu 2)"