scrapy startproject douban
scrapy genspider spider douban.com
from scrapy import cmdline
cmdline.execute('scrapy crawl spider'.split())
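The spider below imports DoubanItem from douban/items.py. The post doesn't show that file, so here is a minimal sketch, assuming only the single comment field that the pipeline writes out later:

# douban/items.py -- minimal sketch; only the field the pipeline uses is assumed
import scrapy

class DoubanItem(scrapy.Item):
    comment = scrapy.Field()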
import scrapy
from scrapy import Request
import urllib.request
from PIL import Image
from douban.items import DoubanItem


class SpiderSpider(scrapy.Spider):
    name = 'spider'
    allowed_domains = ['douban.com']
    start_urls = ['https://accounts.douban.com/login']
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    item = DoubanItem()

    def start_requests(self):
        url = self.start_urls[0]
        yield Request(url=url, headers=self.headers, callback=self.parse_before_login)

    def parse_before_login(self, response):
        print("Filling the login form")
        captcha_id = response.xpath('//*[@id="lzform"]/div[6]/div/div/input[2]/@value').extract_first()
        captcha_image_url = response.xpath('//*[@id="captcha_image"]/@src').extract_first()
        print(captcha_image_url)
        print(captcha_id)
        if captcha_image_url is None:
            print("No captcha on this login page")
            formdata = {
                "source": "index_nav",
                "form_email": "[email protected]",
                "form_password": "XXXXXXX"
            }
            return scrapy.FormRequest.from_response(response, headers=self.headers, formdata=formdata,
                                                    callback=self.parse_after_login)
        else:
            print("This login page has a captcha")
            save_image_path = r'D:\python_project\douban\captcha.jpeg'
            urllib.request.urlretrieve(captcha_image_url, save_image_path)
            try:
                im = Image.open(save_image_path)
                im.show()
            except Exception:
                pass
            # Type the captcha in by hand
            captcha_solution = input('Enter the captcha shown in the opened image: ')
            formdata = {
                'source': 'None',
                'redir': 'https://www.douban.com',
                'form_email': '[email protected]',
                'form_password': 'XXXXXXXX',
                'captcha-solution': captcha_solution,
                'captcha-id': captcha_id,
                'login': '登陆'
            }
            print('logging in ........')
            return scrapy.FormRequest.from_response(response, headers=self.headers, formdata=formdata,
                                                    callback=self.parse_after_login)
That takes care of the automatic login.
Next comes the actual crawling. Douban's reviews are paginated, and each review has its own URL. If you scrape the review list pages directly, you'll find the text cut off halfway: anything over the length limit is collapsed. So instead we follow each review's own link and grab the full text there.
    def parse_after_login(self, response):
        account = response.xpath('//a[@class="bn-more"]/span/text()').extract_first()
        if account is None:
            print("Login failed")
        else:
            print(u"Login succeeded, current account: %s" % account)
            ids = [i * 20 for i in range(0, 28)]
            for id in ids:
                url = 'http://movie.douban.com/subject/26322774/reviews' + '?start={}'.format(id)
                print(url)
                yield Request(url=url, headers=self.headers, callback=self.parse, dont_filter=False)

    def parse(self, response):
        print(response.url)
        ids = response.xpath('//*[@id="content"]/div/div[1]/div[1]/div/@data-cid').extract()
        # print(ids)
        for id in ids:
            url = 'https://movie.douban.com/review/' + id + '/'
            print(url)
            yield Request(url=url, headers=self.headers, method="GET", callback=self.parse_page, dont_filter=False)

    def parse_page(self, response):
        print(response.url)
        print(response.status)
        comments = response.xpath('//*[@id="link-report"]/div[1]/text()').extract()
        print(comments)
        # Hand the review text to the pipeline; without yielding the item, nothing gets saved
        self.item['comment'] = ''.join(comments)
        yield self.item
This walks down the pages level by level and collects the reviews.
When you run it, the corresponding content gets printed.
Before letting it loose, though, there is still some work to do: saving the data and setting up proxy IPs. Sites like this really dislike having a crawler thrown at them, and without a proxy you are likely to get banned.
Let's handle the proxies first. The IPs I use here are free ones and not very stable; if you can, buy a batch of proxy IPs instead.
First, open the settings file (settings.py):
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 10
IPPOOL=[
{"ipaddr":"61.135.217.7:80"},
{"ipaddr":"114.253.36.180:9999"},
{"ipaddr":"125.70.13.77:8080"},
{"ipaddr":"121.49.110.65:8888"},
{"ipaddr":"106.75.226.36:808"},
{"ipaddr":"118.190.95.35:9001"},
{"ipaddr":"180.163.152.130:60596"},
{"ipaddr":"175.175.218.48:1133"},
{"ipaddr":"115.46.65.209:8123"},
{"ipaddr":"182.88.88.110:8123"},
{"ipaddr":"171.38.34.80:8123"},
{"ipaddr":"175.165.128.228:1133"},
{"ipaddr":"175.148.72.52:1133"},
{"ipaddr":"115.46.78.31:8123"},
{"ipaddr":"123.180.68.168:8010"},
{"ipaddr":"121.228.48.32:3128"},
{"ipaddr":"171.37.154.105:8123"},
{"ipaddr":"114.230.41.230:3128"},
{"ipaddr":"221.232.233.200:8010"},
{"ipaddr":"221.229.18.142:808"}
]
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.DoubanDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 543,
    'douban.middlewares.MyProxiesSpiderMiddleware': 125
}
These are the proxy-related additions to settings.py. The ROBOTSTXT_OBEY and DOWNLOAD_DELAY settings are there to cut down on incidents of the spider getting thrown off the site; personally I suggest not setting the delay below 4 seconds.
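The default comments in settings.py also point at AutoThrottle; if you prefer, Scrapy can adapt the delay by itself. A minimal sketch, with example values:

# Optional: let Scrapy adjust the delay automatically (example values)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 30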
Then, following the framework's structure, we enable the proxy in middlewares.py.
First the imports:
import random
from douban.settings import IPPOOL
random is used here to pick a proxy at random for each request.
class MyProxiesSpiderMiddleware(object):
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        thisip = random.choice(IPPOOL)
        print('this is ip:' + thisip['ipaddr'])
        request.meta['proxy'] = 'http://' + thisip['ipaddr']
Once the proxy IPs are set up, the site no longer sees your real IP; the downside is that if your proxies are unreliable, some requests will simply fail to fetch anything.
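Since free proxies drop requests fairly often, it may also help to lean on Scrapy's built-in retry middleware. A small sketch for settings.py, with example values:

# Optional: retry requests that die on a bad proxy (example values)
RETRY_ENABLED = True
RETRY_TIMES = 5
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]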
Next, saving the data to a txt file, which is the simplest possible way to store it.
Open pipelines.py:
import os
import codecs


class DoubanPipeline(object):
    def __init__(self):
        self.file = codecs.open('mian.txt', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # Append the review text to the file
        self.file.write(item['comment'])
        print('Data written')
        return item

    def close_spider(self, spider):
        self.file.close()
If you're not sure where the output file will end up, use an absolute path.
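For example, you could anchor it to the current working directory, reusing the os import above. Just a sketch of the __init__ shown earlier (the file name mian.txt is taken from the pipeline above):

    def __init__(self):
        # Build an absolute path so the output location is explicit
        output_path = os.path.join(os.getcwd(), 'mian.txt')
        self.file = codecs.open(output_path, 'a', encoding='utf-8')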
Then enable our pipeline in settings.py:
ITEM_PIPELINES = {
'douban.pipelines.DoubanPipeline': 300,
}
That's everything set up; now just let it run.
Because my proxy IPs are free ones, mine wasn't very stable... painful. Being broke limits my spider.
Next, let's process the data and see why this movie is considered so bad.
import jieba.analyse
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# This is your data source; jieba does the tokenizing when extracting keywords
with open('a.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Rank the top weighted keywords with TextRank
result = jieba.analyse.textrank(text, topK=250, withWeight=True)
keywords = dict()
for word, weight in result:
    keywords[word] = weight
# print(keywords)

image = Image.open('timg.jpg')  # This is your background image; pick one with plenty of colour
graph = np.array(image)
wc = WordCloud(font_path='simhei.ttf', background_color='White', max_font_size=170, mask=graph)
wc.generate_from_frequencies(keywords)
image_color = ImageColorGenerator(graph)
plt.imshow(wc.recolor(color_func=image_color))
plt.axis('off')
plt.show()
wc.to_file('1.png')
That's all the code, just a few dozen lines.
The resulting word cloud:
From the image you can read off which words appear most often, and the most frequent ones turn out to be... well, you can guess.
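One small knob worth knowing about if the cloud fills up with uninteresting words: jieba's textrank accepts an allowPOS filter that restricts which parts of speech are kept. A minimal sketch (the POS tags shown are jieba's defaults):

# Keep only nouns and verbs when ranking keywords (allowPOS values are jieba POS tags)
result = jieba.analyse.textrank(text, topK=250, withWeight=True,
                                allowPOS=('ns', 'n', 'vn', 'v'))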