Crawling Zhihu Q&A
Tags (space-separated): python scrapy session cookie
The difference between a session and a cookie
- cookie
A cookie is a key/value store of user information kept locally by the browser, which carries serious security risks.
- session
A session lives on the server, which assigns the user an id. When the user makes a request, the server looks up the user's information by that id and sends it back to the browser. A session also has an expiration time.
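The crawler below leans on exactly this mechanism through requests: a Session object stores the cookies the server sets and replays them on every later request, so one login can be reused. A minimal sketch (the URL is only an example):
import requests

# a Session keeps server-issued cookies and sends them back automatically,
# so state such as "logged in" survives across requests
session = requests.session()
session.get("https://www.zhihu.com")       # the server may set cookies here
print(session.cookies.get_dict())          # cookies now held client-side
session.get("https://www.zhihu.com")       # the same cookies are sent back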
Simulated Zhihu login
- HTTP status codes
code | meaning |
---|---|
200 | request returned successfully |
301/302 | permanent redirect / temporary redirect |
403 | access forbidden |
404 | no matching resource |
500 | server error |
503 | server down or under maintenance |
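The 301/302 rows are what the login check further down relies on: with redirects disabled, being bounced away from a login-only page shows up as a redirect status. A small sketch of that pattern (the inbox URL mirrors the is_login() function below):
import requests

# with allow_redirects=False a 301/302 is returned as-is instead of being followed,
# so a redirect on a login-only page usually means there is no valid session
resp = requests.get("https://www.zhihu.com/inbox", allow_redirects=False)
print(resp.status_code)  # 200 when logged in, 301/302 otherwise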
- Simulating a Zhihu login
When the original video was recorded, Zhihu's anti-crawling mechanism was different from the current one; a simulated login now also has to submit a captcha parameter.
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
"""
@author 金全 JQ
@version 1.0 , 2017/10/25
@description Simulated Zhihu login
"""
import requests
import re
try:
import cookielib
except:
import http.cookiejar as cookielib
import time
try:
from PIL import Image
except:
pass
import os
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")
try:
session.cookies.load(ignore_discard=True)
except:
    print("failed to load cookies")
agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36"
header = {
"HOST":"www.zhihu.com",
"Referer":"https://www.zhihu.com",
"User-Agent":agent
}
# extract the _xsrf token from the home page
def get_xsrf():
response = session.get("https://www.zhihu.com",headers= header)
print(response.text)
match_obj = re.findall(r'name="_xsrf" value="(.*?)"', response.text)
if match_obj:
return match_obj[0]
else:
return ""
def is_login():
    # use the inbox (personal center) page to check whether we are logged in
inbox_url = 'http://www.zhihu.com/inbox'
response = session.get(inbox_url,headers= header,allow_redirects=False)
if response.status_code !=200:
return False
else:
return True
def get_captcha():
    # fetch the captcha image and ask the user to type it in
t = str(int(time.time()*1000))
captcha_url = 'http://www.zhihu.com/captcha.gif?r='+t+"&type=login"
r = session.get(captcha_url,headers = header)
with open('captcha.jpg','wb') as f :
f.write(r.content)
f.close()
try:
im = Image.open('captcha.jpg')
im.show()
im.close()
except:
        print(u'please open %s and type in the captcha manually' % os.path.abspath('captcha.jpg'))
    captcha = input('captcha: ')
return captcha
def get_index():
    # fetch the home page with the logged-in session and save it to disk
response = session.get("https://www.zhihu.com", headers=header)
with open("page_index.html","wb") as f:
f.write(response.text.encode("utf-8"))
print("ok")
def zhihu_login(account,password):
    # Zhihu login, by phone number or email address
match_phone = re.match("^1\d{10}$",account)
if match_phone:
        print("logging in with a phone number")
post_number = "https://www.zhihu.com/login/phone_num"
post_data = {
"_xsrf": get_xsrf(),
"phone_num": account,
"captcha":get_captcha(),
"password":password
}
    elif "@" in account:
        print("logging in with an email address")
        post_number = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": get_xsrf(),
            "email": account,
            "captcha": get_captcha(),
            "password": password
        }
    else:
        print("account is neither a phone number nor an email address")
        return
    response = session.post(post_number, data=post_data, headers=header)
    session.cookies.save()
# zhihu_login("${username}","${password}")
# get_index()
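A possible way to drive the functions above, reusing the saved cookies while they are still valid and only logging in again when they are not (the credentials are placeholders):
if __name__ == "__main__":
    if not is_login():
        zhihu_login("${username}", "${password}")   # fill in a real phone number/email and password
    get_index()   # fetch the home page with the logged-in session and save it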
- Implementing the Zhihu login with scrapy
# -*- coding: utf-8 -*-
import scrapy
import time
try:
from PIL import Image
except:
pass
import json
import os
try:
import urlparse as parse
except:
from urllib import parse
class ZhihuLoginSpider(scrapy.Spider):
name = 'zhihu_login'
allowed_domains = ['www.zhihu.com']
start_urls = ['http://www.zhihu.com/']
Agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
header = {
'User-Agent': Agent,
}
def parse(self, response):
        # concrete crawling logic for the home page goes here
all_urls = response.css('a::attr(href)').extract()
all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # keep only https links, dropping javascript: and other non-request urls
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
for url in all_urls:
pass
def start_requests(self):
t = str(int(time.time() * 1000))
captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
return [scrapy.Request(url=captcha_url, headers=self.header, callback=self.parser_captcha)]
def parser_captcha(self, response):
with open('captcha.jpg', 'wb') as f:
f.write(response.body)
f.close()
try:
im = Image.open('captcha.jpg')
im.show()
im.close()
except:
            print(u'please open %s and type in the captcha manually' % os.path.abspath('captcha.jpg'))
captcha = input("please input the captcha\n>")
return scrapy.FormRequest(url='https://www.zhihu.com/#signin', headers=self.header, callback=self.login, meta={
'captcha': captcha
})
def login(self, response):
xsrf = response.xpath("//input[@name='_xsrf']/@value").extract_first()
if xsrf is None:
return ''
post_url = 'https://www.zhihu.com/login/phone_num'
post_data = {
"_xsrf": xsrf,
"phone_num": '${username}',
"password": '${password}',
"captcha": response.meta['captcha']
}
return [scrapy.FormRequest(url=post_url, formdata=post_data, headers=self.header, callback=self.check_login)]
    # check whether the login response reports success
def check_login(self, response):
js = json.loads(response.text)
        if 'msg' in js and js['msg'] == '登录成功':  # the server returns the Chinese string "登录成功" ("login successful")
for url in self.start_urls:
yield scrapy.Request(url=url, headers=self.header, dont_filter=True)
Extracting Zhihu questions and answers
- Using scrapy shell
scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36" <url>
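Inside the shell the selectors used later in question_detail can be tried interactively before they go into the spider (the class names follow the page layout at the time and may have changed since):
# run inside the scrapy shell opened with the command above
response.css('.QuestionHeader-title::text').extract_first()   # question title
response.css('.QuestionHeader-detail').extract()              # question body (html)
response.css('.NumberBoard-value::text').extract()            # follower count / view count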
- Crawler logic
scrapy is built on the twisted framework and schedules requests depth-first by default.
Simulated Zhihu login (purpose: to reach more user data): start_requests is the entry point and requests the captcha image; its callback writes the image to disk, shows it and reads the user's input, and the login is then submitted with a FormRequest carrying the login parameters.
Crawling Zhihu questions: collect every url on the fetched page and keep only https links (dropping anything that is not a real request target), then use a regex to pick out the question urls. The extracted urls are iterated over (non-question urls are fed back into this same callback), each question page is requested and parsed, and finally the answers are read and parsed through Zhihu's answer API, as sketched below.
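The answer API template self.start_answer_url used by question_detail below is not shown in these snippets; roughly, it points at Zhihu's v4 answers endpoint, with the question id, page size and offset filled in by .format(). The exact include fields here are an assumption and only cover what parse_answer reads:
# filled in as start_answer_url.format(question_id, limit, offset) in question_detail
start_answer_url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                    "?include=data[*].content,voteup_count,comment_count,created_time,updated_time"
                    "&limit={1}&offset={2}")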
def parse(self, response):
        # concrete crawling logic for each fetched page
all_urls = response.css('a::attr(href)').extract()
all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # keep only https links, dropping javascript: and other non-request urls
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
for url in all_urls:
match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",url)
if match_obj:
request_url = match_obj.group(1)
question_id = match_obj.group(2)
yield scrapy.Request(url=request_url,headers=self.header,meta={'question_id':question_id},callback=self.question_detail)
else:
yield scrapy.Request(url=url,headers=self.header,callback=self.parse)
def question_detail(self,response):
        # extract the question fields from the page
item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
if "QuestionHeader-title" in response.text:
            # extraction for the new page layout
item_loader.add_value('url',response.url)
item_loader.add_value('zhihu_id',response.meta['question_id'])
item_loader.add_css('title','.QuestionHeader-title::text')
item_loader.add_css('content','.QuestionHeader-detail')
item_loader.add_css('answer_num','.List-headerText span::text')
item_loader.add_css('comments_num','.QuestionHeader-Comment button::text')
item_loader.add_css('watch_user_num','.NumberBoard-value::text')
item_loader.add_css('topics','.QuestionHeader-tags .Popover div::text')
else:
            # handling for the old page layout
item_loader.add_value('url', response.url)
item_loader.add_value('zhihu_id', response.meta['question_id'])
item_loader.add_css('title', '.zh-question-title h2 a::text')
item_loader.add_css('content', '#zh-question-detail')
item_loader.add_css('answer_num', '#zh-question-answer-num::text')
item_loader.add_css('comments_num', '#zh-question-meta-wrap a[name="addcomment"]::text')
item_loader.add_css('watch_user_num', '#zh-question-side-header-wrap::text')
item_loader.add_css('topics', '.zm-tag-editor-labels a::text')
question_item = item_loader.load_item()
yield scrapy.Request(url=self.start_answer_url.format(response.meta['question_id'],20,0),headers=self.header,callback=self.parse_answer)
yield question_item
def parse_answer(self,response):
        # parse the answers to a question, returned as json by the answer API
answer_item = ZhihuAnswerItem()
answer_json = json.loads(response.text)
is_end = answer_json['paging']['is_end']
next_url = answer_json['paging']['next']
for answer in answer_json['data']:
answer_item['zhihu_id'] = answer['id']
answer_item['url'] = answer['url']
answer_item['question_id'] = answer['question']['id']
answer_item['author_id'] = answer['author']['id'] if "id" in answer['author'] else None
answer_item['content'] = answer['content'] if "content" in answer else None
answer_item['praise_num'] = answer['voteup_count']
answer_item['comments_num'] = answer['comment_count']
answer_item['create_time'] = answer['created_time']
answer_item['update_time'] = answer['updated_time']
answer_item['crawl_time'] = datetime.datetime.now()
yield answer_item
if not is_end:
yield scrapy.Request(url=next_url,headers=self.header,callback=self.parse_answer)
Saving Zhihu questions and answers to the database: each item defines a method that builds its own sql statement and parameters; the insert uses MySQL's ON DUPLICATE KEY UPDATE syntax to update the row when the primary key already exists, the parameters are cleaned up and returned, and the actual insertion is then performed asynchronously through twisted.
class ZhihuQuestionItem(scrapy.Item):
    # fields of a Zhihu question
zhihu_id = scrapy.Field()
topics = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
content = scrapy.Field()
answer_num = scrapy.Field()
comments_num = scrapy.Field()
watch_user_num = scrapy.Field()
click_num = scrapy.Field()
crawl_time = scrapy.Field()
def get_insert_sql(self):
insert_sql="""
insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,comments_num,watch_user_num,click_num,crawl_time)
values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
zhihu_id = ''.join(self['zhihu_id'])
topics = ','.join(self['topics'])
url = ''.join(self['url'])
title = ''.join(self['title'])
content = ''.join(self['content'])
answer_num = common.regex_match(''.join(self['answer_num']))
comments_num = common.regex_match(''.join(self['comments_num']))
        watch_user_num = self['watch_user_num'][0]   # .NumberBoard-value::text yields [follower count, view count]
        click_num = self['watch_user_num'][1]
crawl_time = datetime.datetime.now().strftime(MYSQL_DATETIEM_STRFTIME)
params = (zhihu_id,topics,url,title,content,answer_num,comments_num,watch_user_num,click_num,crawl_time)
return insert_sql,params
class ZhihuAnswerItem(scrapy.Item):
    # fields of a Zhihu answer
zhihu_id = scrapy.Field()
url = scrapy.Field()
question_id = scrapy.Field()
author_id = scrapy.Field()
content = scrapy.Field()
praise_num = scrapy.Field()
comments_num = scrapy.Field()
create_time = scrapy.Field()
update_time = scrapy.Field()
crawl_time = scrapy.Field()
def get_insert_sql(self):
insert_sql="""
insert into zhihu_answer(zhihu_id,url,question_id,author_id,content,praise_num,comments_num,create_time,update_time,crawl_time)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE content = VALUES (content),praise_num = VALUES (praise_num),
comments_num = VALUES (comments_num),update_time=VALUES (update_time)
"""
create_time = datetime.datetime.fromtimestamp(self['create_time']).strftime(MYSQL_DATETIEM_STRFTIME)
update_time = datetime.datetime.fromtimestamp(self['update_time']).strftime(MYSQL_DATETIEM_STRFTIME)
params = (self['zhihu_id'],self['url'],self['question_id'],
self['author_id'],self['content'],self['praise_num'],
self['comments_num'],create_time,update_time,
self['crawl_time'].strftime(MYSQL_DATETIEM_STRFTIME))
return insert_sql,params
# insert items asynchronously through twisted's adbapi
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class MysqlTwistedPipeline:
def __init__(self,dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls,settings):
dbparms = dict(
host = settings["MYSQL_HOST"],
db = settings["MYSQL_DBNAME"],
user = settings["MYSQL_USER"],
passwd = settings["MYSQL_PASSWORD"],
charset = 'utf8',
cursorclass = MySQLdb.cursors.DictCursor,
use_unicode = True,
)
dbpool = adbapi.ConnectionPool("MySQLdb",**dbparms)
return cls(dbpool)
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.insert_sql, item)
        query.addErrback(self.handle_error, item, spider)  # handle insertion errors
        return item
def handle_error(self,failure,item,spider):
print(failure)
    def insert_sql(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
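For the pipeline to be used at all, the connection settings read in from_settings and the pipeline registration have to exist in settings.py; a sketch with placeholder values (the module path in ITEM_PIPELINES depends on the actual project layout):
# settings.py
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 1,   # adjust the path to your own project
}
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "zhihu"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
# datetime format used by the items above (the constant name matches the code as written)
MYSQL_DATETIEM_STRFTIME = "%Y-%m-%d %H:%M:%S"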
- Original video: the 慕课网 (imooc) course 聚焦Python分布式爬虫必学框架Scrapy 打造搜索引擎
- Author of this post: XiaoJinZi (personal homepage); please credit the source when reposting
- I am a student with limited experience; my email is [email protected], and corrections of any mistakes or omissions are welcome