爬虫——获取知乎热点新闻

本程序基于 Python 2.7。按照程序提示输入账号和密码之后,即可获取知乎热点新闻的标题与链接;如果想获取知乎的其他信息,可以自行修改代码。

直接上代码啦


import re
import requests
import cookielib
from PIL import Image
import time
import json
import webbrowser
from attr import attrib
from lxml import etree
import urllib2
import urlparse
# Spoof a desktop Chrome UA so Zhihu serves the normal HTML pages.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
filename = 'cookie'
session = requests.Session()
# LWPCookieJar (not the base CookieJar) implements load()/save() to a file,
# which both the load below and the commented-out save() in login() rely on.
session.cookies = cookielib.LWPCookieJar(filename)
try:
    # Reuse a previously saved login session if a cookie file exists.
    session.cookies.load(filename=filename, ignore_discard=True)
except (IOError, cookielib.LoadError):
    # No usable saved cookie -- the script will fall back to a fresh login.
    print('cookie fail')
#
def get_xsrf():
    """Fetch the Zhihu home page and return the _xsrf CSRF token.

    The token must accompany every login POST or Zhihu rejects it.
    Raises IndexError if the page contains no token.
    """
    response = session.get('https://www.zhihu.com', headers=headers)
    html = response.text
    # The token is embedded as a hidden form field:
    #   <input type="hidden" name="_xsrf" value="...">
    # NOTE(review): the original source line was garbled; this pattern is the
    # conventional one for Zhihu's login form at the time -- verify against
    # the live page markup.
    get_xsrf_pattern = re.compile(r'name="_xsrf" value="(.*?)"')
    _xsrf = re.findall(get_xsrf_pattern, html)[0]
    return _xsrf
def get_captcha():
t = str(int(time.time() * 1000))
captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
response = session.get(captcha_url, headers=headers)
with open('cptcha.gif', 'wb') as f:
f.write(response.content)
im = Image.open('cptcha.gif')
im.show()
captcha = raw_input('Verification code:')
print captcha
return captcha
def login(username, password):
    """Log in to Zhihu with either a phone number or an email address.

    username -- an 11-digit phone number or an email address
    password -- the account password

    Prints the server's response message on completion.
    """
    # An all-digit 11-character account is treated as a Chinese phone number.
    # Bug fix: the original tested the global `account` instead of the
    # `username` parameter.
    if re.match(r'\d{11}$', username):
        print('phone logining')
        url = 'http://www.zhihu.com/login/phone_num'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'phone_num': username
                }
    else:
        print('email logining')
        url = 'https://www.zhihu.com/login/email'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'email': username
                }
    # Every login attempt must carry the captcha the user just typed in.
    data['captcha'] = get_captcha()
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg'] + ' codeLogin')
    # Persist cookies so the next run can skip the login step.
    # session.cookies.save(ignore_discard=True, ignore_expires=True)
def nextMore(offset, start):
    """Request one more page of the home feed from Zhihu's TopStory2FeedList node.

    offset -- feed offset of the page to fetch
    start  -- start marker of the page to fetch
    """
    url = 'https://www.zhihu.com/node/TopStory2FeedList'
    # Bug fix: requests does not serialize nested dicts passed in `data`
    # (it would send only the inner keys). The endpoint expects `params`
    # to be a JSON-encoded string, so encode it explicitly.
    data = {'params': json.dumps({'offset': offset, 'start': start}),
            'method': 'next'
            }
    result = session.post(url, data=data, headers=headers)
    # NOTE(review): unlike the login endpoint, the feed response may not
    # carry a 'msg' field -- confirm before relying on this print.
    print((json.loads(result.text))['msg'] + ' ')
def download(url, headers, proxy, num_retries, data=None):
headers = headers or {}
print 'Downloading:', url
request = urllib2.Request(url, data, headers)
opener = urllib2.build_opener()
if proxy:
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
try:
response = opener.open(request)
html = response.read()
code = response.code
except urllib2.URLError as e:
print 'Download error:', e.reason
html = ''
if hasattr(e, 'code'):
code = e.code
if num_retries > 0 and 500 <= code < 600:
# retry 5XX HTTP errors
return download(url, headers, proxy, num_retries - 1, data)
else:
code = None
return html
if name == 'main':
account = raw_input('account:')
secret = raw_input('password:')
login(account, secret)
get_url = 'https://www.zhihu.com/explore/recommendations'
resp = session.get(get_url, headers=headers, allow_redirects=False)
page = etree.HTML(resp.text)
i = 1
while (i<6):
string = "//div[@id='zh-recommend']/div[2]/div[1]/div[" + str(i) + "]/h2/a"
hrefs = page.xpath(string)
for href in hrefs:
print href.text + '\n' + 'https://www.zhihu.com' + str(href.attrib['href'])
url = 'https://www.zhihu.com' + str(href.attrib['href'])
i = i + 1
webbrowser.open(get_url, new=0, autoraise=True)

By 戴眼镜的莫林

你可能感兴趣的:(爬虫——获取知乎热点新闻)