Chapter 5: Crawling Zhihu Questions

Crawling Zhihu questions and answers

Tags (space-separated): python scrapy session cookie


The difference between session and cookie

  • cookie

    A cookie is a set of key/value pairs that the browser stores locally for the user; because it lives on the client, it poses real security risks.

  • session

    A session is kept on the server, which assigns the user an id; on each request the server looks up the user's information by that id and returns it to the browser. Sessions also have an expiry time. (See the short sketch below for how the client side keeps and resends the cookies.)
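    On the client side, requests mirrors this with its Session object: cookies set by the server (including the session id) are stored and sent back automatically on later requests, which is exactly what the login script below relies on. A minimal sketch; httpbin.org is used here only as a demo endpoint:

import requests

# a Session keeps the cookies set by the server and resends them automatically
s = requests.Session()
s.get("https://httpbin.org/cookies/set/sessionid/abc123")   # server sets a cookie
print(s.cookies.get("sessionid"))                            # "abc123", now stored client-side
print(s.get("https://httpbin.org/cookies").json())           # cookie is sent back automatically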

Simulated Zhihu login

  • HTTP status codes

    code      meaning
    200       the request succeeded
    301/302   permanent / temporary redirect
    403       access forbidden
    404       the requested resource does not exist
    500       internal server error
    503       the server is down or under maintenance
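    These codes drive the login check used later in is_login(): with redirects disabled, a login-protected page answers 200 when the session cookies are valid and 301/302 otherwise. A minimal sketch of that pattern with plain requests (same inbox URL as in the script below):

import requests

# detect the login state from the status code instead of parsing the page
resp = requests.get("https://www.zhihu.com/inbox", allow_redirects=False)
if resp.status_code == 200:
    print("logged in")
elif resp.status_code in (301, 302):
    print("not logged in, redirected to", resp.headers.get("Location"))
else:
    print("unexpected status:", resp.status_code)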
  • Simulated login to Zhihu
    Zhihu's anti-crawling mechanism has changed since the original video was recorded, so the simulated login now also has to send a captcha parameter along with the login data.
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
"""
 @author 金全 JQ
 @version 1.0 , 2017/10/25
 @description Simulated Zhihu login
"""

import requests
import re

try:
    import cookielib                       # Python 2
except ImportError:
    import http.cookiejar as cookielib     # Python 3

import time
try:
    from PIL import Image                  # optional, used only to display the captcha
except ImportError:
    pass
import os

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")

try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print("could not load cookies.txt (it may not exist yet)")

agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36"
header = {
    "HOST":"www.zhihu.com",
    "Referer":"https://www.zhihu.com",
    "User-Agent":agent
}


# extract the _xsrf token from the home page
def get_xsrf():
    response = session.get("https://www.zhihu.com",headers= header)
    print(response.text)
    match_obj = re.findall(r'name="_xsrf" value="(.*?)"', response.text)
    if match_obj:
        return match_obj[0]
    else:
        return ""


def is_login():
    # use the inbox page to check whether the current session is logged in
    inbox_url = 'http://www.zhihu.com/inbox'
    response = session.get(inbox_url,headers= header,allow_redirects=False)
    if response.status_code !=200:
        return False
    else:
        return True



def get_captcha():
    # fetch the captcha image, save it to disk and ask the user to type it in
    t = str(int(time.time()*1000))
    captcha_url = 'http://www.zhihu.com/captcha.gif?r='+t+"&type=login"
    r = session.get(captcha_url,headers = header)
    with open('captcha.jpg','wb') as f:
        f.write(r.content)
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        print(u'Please open %s and type in the captcha manually' % os.path.abspath('captcha.jpg'))
    captcha = input('captcha: ')
    return captcha


def get_index():
    # fetch the home page with the logged-in session and save it to disk
    response = session.get("https://www.zhihu.com", headers=header)
    with open("page_index.html","wb") as f:
        f.write(response.text.encode("utf-8"))
    print("ok")


def zhihu_login(account,password):
    # log in with either a phone number or an e-mail address
    match_phone = re.match(r"^1\d{10}$", account)
    if match_phone:
        print("logging in with a phone number")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": get_xsrf(),
            "phone_num": account,
            "captcha": get_captcha(),
            "password": password
        }
    elif "@" in account:
        print("logging in with an e-mail address")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": get_xsrf(),
            "email": account,
            "captcha": get_captcha(),
            "password": password
        }
    else:
        print("the account is neither a phone number nor an e-mail address")
        return

    response = session.post(post_url, data=post_data, headers=header)
    session.cookies.save()




# zhihu_login("${username}","${password}")
# get_index()
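Because the cookies are saved to cookies.txt, the login survives across runs; a typical entry point therefore checks is_login() first and only logs in again when the saved cookies have expired (placeholders kept as in the original):

if not is_login():
    zhihu_login("${username}", "${password}")
get_index()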
  • Implementing the Zhihu login with Scrapy
# -*- coding: utf-8 -*-
import scrapy
import time
try:
    from PIL import Image                  # optional, used only to display the captcha
except ImportError:
    pass

import json
import os

try:
    import urlparse as parse               # Python 2
except ImportError:
    from urllib import parse               # Python 3

class ZhihuLoginSpider(scrapy.Spider):
    name = 'zhihu_login'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    Agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    header = {
        'User-Agent': Agent,
    }

    def parse(self, response):
        # parse pages reached after login; collect and follow their links
        all_urls = response.css('a::attr(href)').extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # keep only https links (drops javascript: pseudo-links)
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            pass  # filled in later in the question-extraction section below

    def start_requests(self):
        # entry point: request the captcha image before the normal start_urls
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [scrapy.Request(url=captcha_url, headers=self.header, callback=self.parser_captcha)]

    def parser_captcha(self, response):
        # save the captcha image, show it, and ask the user to type it in
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except Exception:
            print(u'Please open %s and type in the captcha manually' % os.path.abspath('captcha.jpg'))
        captcha = input("please input the captcha\n>")
        return scrapy.FormRequest(url='https://www.zhihu.com/#signin', headers=self.header, callback=self.login, meta={
            'captcha': captcha
        })

    def login(self, response):
        xsrf = response.xpath("//input[@name='_xsrf']/@value").extract_first()
        if xsrf is None:
            return
        post_url = 'https://www.zhihu.com/login/phone_num'
        post_data = {
            "_xsrf": xsrf,
            "phone_num": '${username}',
            "password": '${password}',
            "captcha": response.meta['captcha']
        }
        return [scrapy.FormRequest(url=post_url, formdata=post_data, headers=self.header, callback=self.check_login)]

    # check whether the login succeeded; if so, start the real crawl
    def check_login(self, response):
        js = json.loads(response.text)
        if 'msg' in js and js['msg'] == '登录成功':   # literal "login successful" message returned by the API
            for url in self.start_urls:
                yield scrapy.Request(url=url, headers=self.header, dont_filter=True)
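Assuming the spider file lives inside a Scrapy project, it is started from the project root like any other spider; the name attribute above determines the crawl name:

scrapy crawl zhihu_login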

Extracting Zhihu question and answer content

  • Using scrapy shell
scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36" <URL>
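Inside the shell, the selectors used later in question_detail can be tried out interactively before they go into the spider, for example on a question page:

# run inside the scrapy shell; `response` is provided by the shell
response.css('.QuestionHeader-title::text').extract_first()           # question title
response.css('.QuestionHeader-Comment button::text').extract_first()  # comment count
response.css('.NumberBoard-value::text').extract()                    # follower and view counts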
  • Crawling logic

    Scrapy is built on the Twisted asynchronous framework; its default scheduler uses a LIFO queue, so the crawl proceeds depth-first.

    Simulated Zhihu login (needed to reach more user data): start_requests is the entry point and first fetches the captcha; the callback writes the image to disk, shows it and reads the user's input, and the spider then logs in by posting the login parameters with a FormRequest.

    Crawling Zhihu questions: collect every URL on the fetched page, keep only https links so that entries that are not real requests are dropped, then use a regex to pick out the question URLs. Each URL is iterated over (non-question URLs are fed back into this same callback); each question page is requested and parsed for its content, and the answers are then read and parsed through Zhihu's answers API.

    def parse(self, response):
        # collect all links from the crawled page
        all_urls = response.css('a::attr(href)').extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # keep only https links (drops javascript: pseudo-links)
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",url)
            if match_obj:
                request_url = match_obj.group(1)
                question_id = match_obj.group(2)
                yield scrapy.Request(url=request_url,headers=self.header,meta={'question_id':question_id},callback=self.question_detail)
            else:
                yield scrapy.Request(url=url,headers=self.header,callback=self.parse)

    def question_detail(self,response):
        # extract the question fields from the page
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        if "QuestionHeader-title" in response.text:
            # selectors for the new page layout
            item_loader.add_value('url',response.url)
            item_loader.add_value('zhihu_id',response.meta['question_id'])
            item_loader.add_css('title','.QuestionHeader-title::text')
            item_loader.add_css('content','.QuestionHeader-detail')
            item_loader.add_css('answer_num','.List-headerText span::text')
            item_loader.add_css('comments_num','.QuestionHeader-Comment button::text')
            item_loader.add_css('watch_user_num','.NumberBoard-value::text')
            item_loader.add_css('topics','.QuestionHeader-tags .Popover div::text')
        else:
            # selectors for the old page layout
            item_loader.add_value('url', response.url)
            item_loader.add_value('zhihu_id', response.meta['question_id'])
            item_loader.add_css('title', '.zh-question-title h2 a::text')
            item_loader.add_css('content', '#zh-question-detail')
            item_loader.add_css('answer_num', '#zh-question-answer-num::text')
            item_loader.add_css('comments_num', '#zh-question-meta-wrap a[name="addcomment"]::text')
            item_loader.add_css('watch_user_num', '#zh-question-side-header-wrap::text')
            item_loader.add_css('topics', '.zm-tag-editor-labels a::text')
        question_item = item_loader.load_item()
        yield scrapy.Request(url=self.start_answer_url.format(response.meta['question_id'],20,0),headers=self.header,callback=self.parse_answer)
        yield question_item



    def parse_answer(self,response):
        # parse one page of answers returned by the answers API
        answer_item = ZhihuAnswerItem()
        answer_json = json.loads(response.text)
        is_end = answer_json['paging']['is_end']
        next_url = answer_json['paging']['next']

        for answer in answer_json['data']:
            answer_item['zhihu_id'] = answer['id']
            answer_item['url'] = answer['url']
            answer_item['question_id'] = answer['question']['id']
            answer_item['author_id'] = answer['author']['id'] if "id" in answer['author'] else None
            answer_item['content'] = answer['content'] if "content" in answer else None
            answer_item['praise_num'] = answer['voteup_count']
            answer_item['comments_num'] = answer['comment_count']
            answer_item['create_time'] = answer['created_time']
            answer_item['update_time'] = answer['updated_time']
            answer_item['crawl_time'] = datetime.datetime.now()
            yield answer_item

        if not is_end:
            yield scrapy.Request(url=next_url,headers=self.header,callback=self.parse_answer)
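This snippet assumes the imports (re, json, datetime, ItemLoader and the two item classes) at the top of the spider, and it calls self.start_answer_url, which is not shown above: a class attribute holding a format string with three slots (question id, page size, offset) that points at Zhihu's v4 answers API. A shortened sketch of what such an attribute could look like; the exact include fields are an assumption, and the real project lists many more:

    # assumed template for the answers API (slots: question id, limit, offset);
    # the real include parameter contains more fields than shown here
    start_answer_url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                        "?include=data[*].content,voteup_count,comment_count,"
                        "created_time,updated_time"
                        "&limit={1}&offset={2}")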

Saving questions and answers to the database: each item defines its own get_insert_sql method that returns the SQL statement and its parameters. The answer insert uses MySQL's ON DUPLICATE KEY UPDATE syntax, so rows whose primary key already exists are updated instead of duplicated; the method also normalizes the parameters before returning them. The actual inserts are then executed asynchronously through Twisted.

class ZhihuQuestionItem(scrapy.Item):
    # fields of a Zhihu question
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql="""
            insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,comments_num,watch_user_num,click_num,crawl_time) 
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        zhihu_id = ''.join(self['zhihu_id'])
        topics = ','.join(self['topics'])
        url = ''.join(self['url'])
        title = ''.join(self['title'])
        content = ''.join(self['content'])
        answer_num = common.regex_match(''.join(self['answer_num']))
        comments_num = common.regex_match(''.join(self['comments_num']))
        # the .NumberBoard-value selector returns two numbers: followers and views
        watch_user_num = self['watch_user_num'][0]
        click_num = self['watch_user_num'][1]
        crawl_time = datetime.datetime.now().strftime(MYSQL_DATETIEM_STRFTIME)
        params = (zhihu_id,topics,url,title,content,answer_num,comments_num,watch_user_num,click_num,crawl_time)
        return insert_sql,params

class ZhihuAnswerItem(scrapy.Item):
    # fields of a Zhihu answer
    zhihu_id = scrapy.Field()
    url = scrapy.Field()
    question_id = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    praise_num = scrapy.Field()
    comments_num = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()
    def get_insert_sql(self):
        insert_sql="""
            insert into zhihu_answer(zhihu_id,url,question_id,author_id,content,praise_num,comments_num,create_time,update_time,crawl_time)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE content = VALUES(content), praise_num = VALUES(praise_num),
            comments_num = VALUES(comments_num), update_time = VALUES(update_time)
        """
        create_time = datetime.datetime.fromtimestamp(self['create_time']).strftime(MYSQL_DATETIEM_STRFTIME)
        update_time = datetime.datetime.fromtimestamp(self['update_time']).strftime(MYSQL_DATETIEM_STRFTIME)
        params = (self['zhihu_id'],self['url'],self['question_id'],
                  self['author_id'],self['content'],self['praise_num'],
                  self['comments_num'],create_time,update_time,
                  self['crawl_time'].strftime(MYSQL_DATETIEM_STRFTIME))
        return insert_sql,params
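# Note: get_insert_sql() above calls common.regex_match(), a project helper that
# is not shown in this post. Judging from its usage it extracts the number from
# strings such as "1,234 个回答"; a minimal sketch (name and behaviour assumed):
import re

def regex_match(text):
    # pull the first integer out of the text, ignoring thousands separators
    match_re = re.match(r".*?(\d+)", text.replace(",", ""))
    return int(match_re.group(1)) if match_re else 0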
        
        
        
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


# insert items asynchronously through Twisted's adbapi connection pool
class MysqlTwistedPipeline:
    def __init__(self,dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls,settings):
        dbparms = dict(
            host = settings["MYSQL_HOST"],
            db = settings["MYSQL_DBNAME"],
            user = settings["MYSQL_USER"],
            passwd = settings["MYSQL_PASSWORD"],
            charset = 'utf8',
            cursorclass = MySQLdb.cursors.DictCursor,
            use_unicode = True,
        )
        dbpool =  adbapi.ConnectionPool("MySQLdb",**dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.insert_sql, item)
        query.addErrback(self.handle_error, item, spider)  # handle insert errors
        return item

    def handle_error(self,failure,item,spider):
        print(failure)

    def insert_sql(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
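Finally, the pipeline has to be enabled in settings.py together with the database credentials that from_settings reads; the module path and values below are only an assumption about the project layout:

# settings.py (sketch; adjust the pipeline path to your own project)
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 1,
}
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
MYSQL_DATETIEM_STRFTIME = "%Y-%m-%d %H:%M:%S"   # datetime format used by the items above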

  • The original video is the Imooc (慕课网) course 聚焦Python分布式爬虫必学框架Scrapy 打造搜索引擎 (Scrapy for distributed Python crawlers: building a search engine).
  • Author of this post: XiaoJinZi (personal homepage). Please credit the source when reposting.
  • I am a student with limited experience; corrections for any mistakes or omissions are welcome at [email protected].
