如何模拟用户登录爬取知乎

**如何模拟用户登录爬取知乎**


import requests

# The cookie-jar module can load cookies from a local file and hand them to
# requests. Only a missing module should trigger the fallback, so catch
# ImportError rather than everything.
try:
    import cookielib                   # Python 2 name
except ImportError:
    import http.cookiejar as cookielib       # Python 3 name

import re

# A requests Session carries cookies across requests. `requests.sessions` is
# a module and is not callable; the session object is `requests.Session()`.
session = requests.Session()
# LWPCookieJar is a file-backed jar, so the cookies can be save()d and load()ed.
session.cookies = cookielib.LWPCookieJar(filename="cookies.text")

# Best effort: reuse cookies from an earlier run when the file exists and
# parses. A missing or unreadable file is reported and otherwise ignored.
try:
    session.cookies.load(ignore_discard=True)
except (IOError, cookielib.LoadError):
    print("cookie未能加载")




# User-Agent string sent with every request.
agent = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) "
    "Gecko/20100101 Firefox/45.0"
)
# Request headers shared by every call made through the session.
header = dict(
    [
        ("HOST", "www.zhihu.com"),
        ("Referer", "https://www.zhihu.com"),
        ("User-Agent", agent),
    ]
)

def is_login():
    """Report whether the current session appears to be logged in.

    Requests https://www.zhihu.com/index with redirects disabled and treats
    a 200 status as "logged in"; any other status counts as "not logged in".
    (Presumably an anonymous visitor is redirected away from this page --
    confirm against the live site.)

    Returns:
        True when the response status code is 200, otherwise False.
    """
    probe_url = "https://www.zhihu.com/index"
    reply = session.get(probe_url, headers=header, allow_redirects=False)
    return reply.status_code == 200


def get_xsrf():
    """Fetch the Zhihu home page and return its `_xsrf` form token.

    Returns:
        The value of the first `name="_xsrf" value="..."` attribute pair
        found in the page, or an empty string when none is present.
    """
    response = session.get("https://www.zhihu.com", headers=header)

    # re.search scans the whole document. The previous re.match('.*...')
    # could only succeed when the token sat on the first line, because '.'
    # does not cross newlines without re.DOTALL.
    match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text)
    if match_obj:
        return match_obj.group(1)
    return ""


def get_index():
    """Download the Zhihu home page and save it to index_page.html.

    The response text is written as UTF-8 bytes into the current working
    directory, then "ok" is printed.
    """
    page = session.get("https://www.zhihu.com", headers=header)
    encoded_html = page.text.encode("utf-8")
    with open("index_page.html", "wb") as out_file:
        out_file.write(encoded_html)
    print("ok")


def zhihu_login(account, password):
    """Log in to Zhihu and persist the resulting cookies to disk.

    Args:
        account: Either a phone number (exactly 11 digits starting with 1)
            or an email address (any other string containing "@").
        password: The account password, sent as-is in the form data.

    Raises:
        ValueError: If `account` is neither a phone number nor an email
            address. Raised before any network request is made.
    """
    # The pattern is anchored at both ends so that an email which merely
    # starts with 11 digits is not posted to the phone endpoint.
    if re.match(r"^1\d{10}$", account):
        print("手机号码登录")
        post_url = "https://www.zhihu.com/login/phone_num"  # phone login URL
        account_field = "phone_num"
    elif "@" in account:
        print("邮箱登录")
        post_url = "https://www.zhihu.com/login/email"  # email login URL
        account_field = "email"
    else:
        # Previously this case fell through and crashed with
        # UnboundLocalError on post_url.
        raise ValueError(
            "account must be a phone number or an email address: %r" % (account,)
        )

    post_data = {
        "_xsrf": get_xsrf(),
        account_field: account,
        "password": password,
    }
    session.post(post_url, data=post_data, headers=header)

    # Write the session cookies to the jar's file.
    # NOTE(review): cookies are loaded with ignore_discard=True but saved
    # without it, so session-only cookies are not written -- confirm intent.
    session.cookies.save()

# Module-level call: requests the home page and parses the _xsrf token.
# The returned token is discarded here.
get_xsrf()

你可能感兴趣的:(Python,Scrapy,如何模拟用户登录爬取知乎网站)