本文不是python、scrapy的教程,而是分享一个好玩的点子。
python教程请看python教程,scrapy教程请看scrapy教程
爬取豆瓣高分电影教程参考python爬虫入门笔记:用scrapy爬豆瓣
本文爬的是即将上映电影
先看一下推送的效果图,这个是针对手机上qq和微信写的CSS,其它的应该没办法看,很乱。。。
不幸的是这两部电影都。。。
qq绑定qq邮箱的推送 微信绑定qq邮箱的推送
默认你已经安装好scrapy了啊,用anaconda装比较方便,mysql安装:
pip install mysql-connector
只粘一些关键的代码了,不太清楚的看上面给的python爬虫入门笔记
数据库设计如下,用来存放电影的信息
-- Disable FK checks so the table can be dropped/recreated unconditionally.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for upcomingfilm
-- One row per upcoming film scraped from Douban's "coming soon" page;
-- filled by the scrapy pipeline, read and then emptied by the mail script.
-- ----------------------------
DROP TABLE IF EXISTS `upcomingfilm`;
CREATE TABLE `upcomingfilm` (
`id` bigint(255) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL, -- film title; the spider truncates titles longer than 6 chars
`date` varchar(255) DEFAULT NULL, -- release date as the raw listing string (month/day, no year)
`wish` int(10) DEFAULT NULL, -- number of users who marked "want to watch"
`description` varchar(500) DEFAULT NULL, -- synopsis, capped at 110 chars by the spider
`duration` int(10) DEFAULT NULL, -- runtime in minutes
`region` varchar(100) DEFAULT NULL, -- country/region of origin
`director` varchar(255) DEFAULT NULL,
`actors` varchar(255) DEFAULT NULL, -- up to three leading actors, '/'-joined
`type` varchar(100) DEFAULT NULL, -- genre
`poster` varchar(255) DEFAULT NULL, -- poster image URL
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
items.py 用来定义爬取到的电影的字段
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field
class RecommendItem(scrapy.Item):
    """One upcoming film scraped from Douban's "coming soon" listing."""

    title = scrapy.Field()        # film title
    description = scrapy.Field()  # synopsis
    date = scrapy.Field()         # release date (raw listing string)
    wish = scrapy.Field()         # number of "want to watch" users
    duration = scrapy.Field()     # runtime in minutes
    region = scrapy.Field()       # country/region of origin
    director = scrapy.Field()     # director name
    actors = scrapy.Field()       # leading actors
    poster = scrapy.Field()       # poster image URL
    type = scrapy.Field()         # genre
然后爬取一周左右时长内的电影,爬多了也不好,可能会有新的电影加进来,爬的注释也懒得写了,当时也是写给自己用的,就没怎么写注释。。。爬取的逻辑中yield request会自动让pipelines.py将数据插入到数据库
# -*- coding: UTF-8 –*-
import datetime
import chardet
import mysql
import scrapy
from mysql import connector
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError
from recommend.items import RecommendItem
class upcomingfilms(scrapy.Spider):
    """Spider for Douban's "coming soon" page.

    Collects films opening within roughly the next week and follows each
    film's detail page to fill in director/actors/poster/duration/synopsis.
    Yielded items are persisted by RecommendPipeline.
    """

    name = 'upcomingfilms'
    # Douban "coming soon" listing (sorted by release date).
    start_urls = [
        'https://movie.douban.com/coming'
    ]

    @staticmethod
    def _release_date(text, today):
        """Parse a listing date such as '09月14日' into a datetime.date.

        The listing never shows a year; a month earlier than the current
        one is assumed to roll over into next year (e.g. December -> January).
        """
        month = int(text[0:2])
        day = int(text[3:5])
        year = today.year + 1 if month < today.month else today.year
        return datetime.date(year, month, day)

    def parse(self, response):
        """Walk the listing table and yield one detail-page request per film."""
        # NOTE: the original also ran a chardet decode/encode of response.body
        # here, but the result was never used — removed as dead code.
        films = response.css('tbody')[1].css('tr')
        today = datetime.date.today()
        # Only keep films opening within ~8 days; the listing is sorted by
        # date, so we can stop at the first film past the cutoff.
        cutoff = today + datetime.timedelta(days=8)
        for film in films:
            date = film.css('td::text')[0].extract().strip()
            # BUG FIX: the original compared month/day string slices, which
            # never triggered the break when the 8-day window crossed a month
            # boundary (everything got scraped). Compare real dates instead.
            if self._release_date(date, today) > cutoff:
                break
            item = RecommendItem()
            filmtype = film.css('td::text')[3].extract()
            region = film.css('td::text')[4].extract()
            wish = film.css('td::text')[5].extract()
            title = film.css('a::text')[0].extract()
            filmDetail = film.css('a::attr(href)')[0].extract()
            item['date'] = date
            tempTitle = title.strip()
            if len(tempTitle) > 6:
                # Keep the e-mail layout tidy: truncate long titles.
                tempTitle = tempTitle[0:6] + '...'
            item['title'] = tempTitle
            item['type'] = filmtype.strip()
            item['region'] = region.strip()
            # wish looks like '12345人想看' — keep only the leading number.
            wish = wish.strip()
            item['wish'] = int(wish[:wish.index('人')])
            request = scrapy.Request(url=filmDetail, callback=self.parse_detail)
            request.meta['item'] = item
            yield request

    def parse_detail(self, response):
        """Fill the remaining fields of the item carried in request meta."""
        item = response.meta['item']
        item['director'] = response.css('#info a::text')[0].extract()
        description = response.css('#link-report span::text')[0].extract().strip()
        if len(description) > 110:
            description = description[0:110] + '...'  # cap synopsis length
        item['description'] = description
        actors = response.css('#info span')[8].css('a::text').extract()
        item['actors'] = '/'.join(actors[0:3])  # at most three leading actors
        item['poster'] = response.css('#mainpic img::attr(src)')[0].extract()
        item['duration'] = int(
            response.css("#info span[property='v:runtime']")[0]
            .css('::attr(content)')[0].extract())
        return item
pipelines.py用来保存电影信息
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import mysql
class RecommendPipeline(object):
    """Persist scraped film items into the `upcomingfilm` MySQL table."""

    # Item keys, in the same order as the columns of the INSERT statement.
    _FIELDS = ('title', 'description', 'date', 'wish', 'duration',
               'region', 'director', 'actors', 'poster', 'type')

    def __init__(self):
        # Fill in your own database credentials here.
        self.conn = mysql.connector.connect(host='', port=3306, user='root', password='', database='',charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one film row and commit immediately."""
        insert_sql = "insert into upcomingfilm(title, description, date, wish, duration, region, director, actors, poster, type)VALUES (%s, %s, %s, %s, %s,%s, %s, %s, %s,%s)"
        row = tuple(item.get(field) for field in self._FIELDS)
        self.cursor.execute(insert_sql, row)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
最后是发送邮件,本文最有价值的东东,就是这个我自己设计的CSS,其它东西都能百度到,爬即将上映的应该也有人写了吧,要发送邮件别忘了申请smtp口令。发送邮件分几个步骤,1、连接数据库把想看人数大于5000的筛选出来,我不想看烂片,哈哈,这个参数你可以自己调。2、将数据填充到模板,用邮件发送,我这里只发送给自己,当然也支持群发。如果你的机器定时发送比较麻烦,并且想要每周接收即将上映的电影信息可以评论留下邮箱。3、清空数据库的内容,以便每次得到新的内容。其实这种实时性的东西用不着数据库,放在一个list中排序就行了,我只是懒得百度那种方法了。。。
# -*- coding: UTF-8 –*-
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
import mysql.connector
#数据库属性自己填哦
# Connect to the database that the spider filled (credentials go here).
conn = mysql.connector.connect(host='', port=3306, user='root', password='', database='',charset='utf8')
cursor = conn.cursor()
# Only recommend films at least 5000 people want to watch, best first;
# tune the threshold to taste.
cursor.execute('select * from upcomingfilm where wish > 5000 order by wish desc')
values = cursor.fetchall()
print(values)
# Assemble the HTML mail body: header + one formatted section per film +
# footer. Placeholders {1}..{8} map to columns of the upcomingfilm table
# (see the CREATE TABLE above); {0}/{9} are poster/description slots.
head = '''
即将上映
'''
content='''
{1}
导演:
{2}
主演:
{3}
类型:
{4}
{5}
|
{6}
分钟
上映日期:
{7}
想看人数:
{8}
'''
foot='''
'''
# BUG FIX: the original accumulated into a variable named `str`, shadowing
# the builtin, and grew the string with quadratic `+=`; build the sections
# in a list and join once instead.
# Row columns: (id, title, date, wish, description, duration, region,
# director, actors, type, poster).
sections = [
    content.format(value[10], value[1], value[7], value[8], value[9],
                   value[6], value[5], value[2], value[3], value[4])
    for value in values
]
head += ''.join(sections) + foot
print(head)
my_sender='[email protected]' # sender address
my_pass = '' # sender password (the SMTP authorization code issued by QQ mail)
my_user='[email protected]' # recipient address; here we just send to ourselves
def mail(content):
    """Send *content* as an HTML e-mail via QQ's SMTP-over-SSL server.

    Uses the module-level my_sender / my_pass / my_user credentials.
    Returns True on success, False on any failure (best-effort: errors
    are deliberately swallowed rather than raised).
    """
    try:
        msg = MIMEText(content, 'html', 'utf-8')
        msg['From'] = formataddr(["细肥尸丁", my_sender])  # sender display name + address
        msg['To'] = formataddr(["粪鸡", my_user])          # recipient display name + address
        msg['Subject'] = "即将上映"                        # mail subject
        # BUG FIX: the original called server.quit() only on the success
        # path, leaking the socket when login/sendmail raised. The context
        # manager closes the connection in every case.
        with smtplib.SMTP_SSL("smtp.qq.com", 465) as server:
            server.login(my_sender, my_pass)
            server.sendmail(my_sender, [my_user], msg.as_string())
    except Exception:
        return False
    return True
mail(head)
# Clear the table so the next crawl starts from a clean slate.
cursor.execute('delete from upcomingfilm')
conn.commit()
cursor.close()
# BUG FIX: the original had `conn.close` without parentheses, which is a
# no-op attribute access — the connection was never actually closed.
conn.close()
linux上用crontab定时推送,不详细解释,自行百度,注意要用绝对路径,样例如下:
4 9 * * 1,3,5 cd /root/python/recommend && /root/anaconda3/bin/scrapy crawl upcomingfilms
5 9 * * 1,3,5 cd /root/python/recommend/recommend && /root/anaconda3/bin/python send_mail.py
windows上用计划任务。
注意执行发送邮件的脚本要比爬取电影信息的脚本晚1分钟
可能就算给自己推荐了即将上映的电影也没money去影院看,不过没关系,一部电影上映没几天我就能在网上找到资源,哈哈哈
还是老老实实继续做java吧。。。因为懒得主动去看有什么电影即将上映,就学了一手python、scrapy,linux。。。写了个这玩意儿推送给自己。
项目完整代码可以在https://download.csdn.net/download/qq_37518622/10588266下载,改一下数据库配置