import requests
from pyquery import PyQuery as pq

# Scrape the Zhihu "explore" page and append question/author/answer triples
# to explore.txt, one record per feed item, separated by a '=' rule.
url = 'http://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
html = requests.get(url, headers=headers).text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
# Open the output file once with a context manager: the original re-opened
# and manually close()d 'explore.txt' on every iteration, which is both
# wasteful and leak-prone if a write raises.
with open('explore.txt', 'a', encoding='utf-8') as file:
    for item in items:
        question = item.find('h2').text()
        author = item.find('.author-link-line').text()
        answer = item.find('.content').text()
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
import urllib.request
import json
# Target weibo uid to crawl. NOTE(review): the name shadows the builtin
# id(); kept because the driver code below references this exact name.
id='1320135280'
# HTTP proxy in "host:port" form used by user_proxy() for all requests.
proxy_addr="122.241.72.191:808"
def user_proxy(url, proxy_addr):
    """Fetch *url* through the given HTTP proxy and return the decoded body.

    Parameters:
        url: address to request.
        proxy_addr: proxy in "host:port" form, applied to plain-http traffic.

    Returns:
        The response body decoded as UTF-8 text.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Use the opener directly instead of install_opener(): the original
    # installed the proxy process-wide on every call, silently changing the
    # behaviour of every other urllib user. Also close the response.
    with opener.open(req) as resp:
        return resp.read().decode('utf-8')
def get_containerid(url):
    """Return the containerid of the user's 'weibo' tab.

    Fetches the profile-index JSON at *url* via the module-level proxy and
    scans data.tabsInfo.tabs for the entry whose tab_type is 'weibo'.
    Falls through (returning None) when no such tab exists.
    """
    raw = user_proxy(url, proxy_addr)
    print(raw)  # debug dump of the raw JSON, kept from the original
    payload = json.loads(raw).get('data')
    for entry in payload.get('tabsInfo').get('tabs'):
        if entry.get('tab_type') == 'weibo':
            return entry.get('containerid')
def get_user_info(id):
    """Yield a single dict of profile fields for weibo user *id*.

    NOTE(review): this is a generator (it uses ``yield``), so nothing runs
    until the caller iterates it. Kept as a generator to preserve the
    public interface.

    Parameters:
        id: weibo uid as a string.

    Yields:
        dict with keys id, statuses_count, gender, followers_count,
        follow_count, profile_url.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
    data = user_proxy(url, proxy_addr)
    print(data)  # debug dump of the raw JSON, kept from the original
    content = json.loads(data).get('data')
    # Hoist the repeated content.get('userInfo') lookup: the original
    # performed it once per field.
    user_info = content.get('userInfo')
    user = {
        'id': user_info.get('id'),
        'statuses_count': user_info.get('statuses_count'),
        'gender': user_info.get('gender'),
        # The API field is 'follower_count' (singular); the output key keeps
        # the original plural spelling for backward compatibility.
        'followers_count': user_info.get('follower_count'),
        'follow_count': user_info.get('follow_count'),
        'profile_url': user_info.get('profile_url'),
    }
    yield user
def get_weibo(id, file):
    """Crawl every page of a user's weibo feed and append the posts to *file*.

    Pages are fetched through the module-level proxy until the API stops
    returning cards. Each card of card_type 9 (a normal post) is written as
    a small text record.

    Parameters:
        id: weibo uid as a string.
        file: path of the output text file (opened in append mode).
    """
    # Hoist the containerid lookup: the original re-fetched the profile
    # index on every page, doubling the number of requests per page.
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
    containerid = get_containerid(url)
    i = 1
    while True:
        weibo_url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value='
                     + id + '&containerid=' + containerid + '&page=' + str(i))
        try:
            data = user_proxy(weibo_url, proxy_addr)
            content = json.loads(data).get('data')
            cards = content.get('cards')
            # Treat a missing or empty card list as the end of the feed.
            # The original called len(cards) on None, raising TypeError into
            # the except below and looping forever.
            if not cards:
                break
            for j, card in enumerate(cards):
                print("-----正在爬取第" + str(i) + "页,第" + str(j) + "条微博------")
                if card.get('card_type') == 9:
                    mblog = card.get('mblog')
                    attitudes_count = mblog.get('attitudes_count')
                    comments_count = mblog.get('comments_count')
                    created_at = mblog.get('created_at')
                    reposts_count = mblog.get('reposts_count')
                    scheme = card.get('scheme')
                    text = mblog.get('text')
                    with open(file, 'a', encoding='utf-8') as fh:
                        fh.write("----第" + str(i) + "页,第" + str(j) + "条微博----" + "\n")
                        fh.write("微博地址:" + str(scheme) + "\n" + "发布时间:" + str(created_at) + "\n" + "微博内容:" + text + "\n" + "点赞数:" + str(attitudes_count) + "\n" + "评论数:" + str(comments_count) + "\n" + "转发数:" + str(reposts_count) + "\n")
            i += 1
        except Exception as e:
            # Best-effort: report the error and stop. The original printed
            # and continued, which retried the same page forever whenever
            # the failure was persistent.
            print(e)
            break
# Driver for the urllib-based crawler.
url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
# get_user_info() is a generator: the original called it and discarded the
# result, so its body never executed. Iterate it to actually fetch and
# show the profile. (The original's standalone get_containerid(url) call
# also discarded its result and is dropped; get_weibo performs the lookup
# itself.)
for user in get_user_info(id):
    print(user)
get_weibo(id, 'mayun.txt')
import requests
import json
from urllib.parse import urlencode
from pymongo import MongoClient
from pyquery import PyQuery as pq
# m.weibo.cn container API endpoint; the query string is appended per page.
base_url='https://m.weibo.cn/api/container/getIndex?'
headers={
'Host': 'm.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2145291155',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
# MongoDB handles: database 'weibo', collection 'mayun'. MongoClient()
# connects to the default localhost:27017 instance.
client=MongoClient()
db=client['weibo']
collection=db['mayun']
# Number of feed pages to crawl in the __main__ loop below.
max_page=14
def get_page(page):
    """Fetch one page of the user's feed and return the parsed JSON.

    Parameters:
        page: 1-based page number to request.

    Returns:
        The decoded JSON dict on HTTP 200; None on any other status or on
        a connection error (which is printed, as in the original).
    """
    query = {
        'type': 'uid',
        'value': '2145291155',
        'containerid': '1076032145291155',
        'page': page,
    }
    full_url = base_url + urlencode(query)
    try:
        resp = requests.get(full_url, headers=headers)
    except requests.ConnectionError as err:
        print('error', err.args)
    else:
        if resp.status_code == 200:
            return resp.json()
def parse_page(jsonstr):
    """Yield one flattened dict per weibo post in a getIndex JSON response.

    Parameters:
        jsonstr: parsed JSON dict returned by get_page(), or None.

    Yields:
        dict with keys id, text (HTML stripped via pyquery), attitudes,
        comments, reposts.
    """
    if not jsonstr:
        return
    data = jsonstr.get('data') or {}
    # Use .get with a fallback instead of ['cards']: the original raised
    # KeyError on malformed responses, inconsistent with its .get style.
    for card in data.get('cards') or []:
        mblog = card.get('mblog')
        # Skip non-post cards (ads, pinned headers) that carry no 'mblog';
        # the original crashed with AttributeError on None for them.
        if not mblog:
            continue
        weibo = {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }
        yield weibo
def save_to_mongo(result):
    """Insert one weibo record into the module-level MongoDB collection.

    Uses insert_one(): Collection.insert() was deprecated in pymongo 3.0
    and removed entirely in pymongo 4.0, so the original raises
    AttributeError on current drivers.
    """
    if collection.insert_one(result):
        print('save to mongo')
if __name__ == '__main__':
    # Crawl pages 1..max_page, persisting and echoing every parsed post.
    for page_no in range(1, max_page + 1):
        payload = get_page(page_no)
        for record in parse_page(payload):
            save_to_mongo(record)
            print(record)