知乎爬虫

这个刚开始可以爬,后来发现不能登录了。

import urllib.parse
import urllib.request
import http.cookiejar
import re

# NOTE(review): this login endpoint is deprecated by Zhihu (the original note
# says it stopped working); kept for reference.

# Cookie-aware opener installed globally so the login session cookie persists
# across the subsequent requests made with urllib.request.urlopen().
cookie = http.cookiejar.CookieJar()
cookie_processor = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_processor)
urllib.request.install_opener(opener)

# Fetch the login page and scrape the anti-CSRF token (_xsrf) out of the form.
h = urllib.request.urlopen('https://www.zhihu.com/login/phone_num').read().decode('utf_8')
xsrf_pattern = re.compile(r'name="_xsrf" value="(.*?)"')
xsrf_match = xsrf_pattern.search(h)
if xsrf_match is None:
    # Fail fast with a clear message instead of an AttributeError on .group()
    # when the page layout changes and the token is no longer present.
    raise RuntimeError('could not locate the _xsrf token on the login page')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
    'Referer': 'https://www.zhihu.com/login/phone_num',
}
post_fields = {
    '_xsrf': xsrf_match.group(1),
    'password': '密码',      # placeholder: the account password goes here
    'captcha_type': 'cn',
    'phone_num': '手机号',   # placeholder: the phone number goes here
}

# urlencode then encode to bytes: urlopen() requires a bytes body for POST.
postData = urllib.parse.urlencode(post_fields).encode(encoding='UTF8')
request = urllib.request.Request(
    url='https://www.zhihu.com/login/phone_num',
    data=postData,
    headers=headers,
)
response = urllib.request.urlopen(request)
text = response.read().decode("utf8")

# With the session cookie now in the jar, a plain GET of the home page should
# return the logged-in view.
url = 'https://www.zhihu.com'
req = urllib.request.Request(url=url, headers=headers)
result = urllib.request.urlopen(req).read().decode("UTF8")

print(text)

print(result)

你可能感兴趣的:(知乎爬虫)