首先抓包分析一下页面,可以看到我们想要的每一页的全部数据都在"article"下。
而其中每一部的电影的数据可以看到在"info"下。
所以我们只要在info下找到自己的目标数据并想好匹配方法即可,本文使用的是xpath,其实也可以在spiders中导入pyquery或者BeautifulSoup来进行匹配,当然正则也是可以的。现在我们去找到目标数据。
这些便是我们的目标数据,接下来便可以动手爬取了。
import scrapy
class MovieItem(scrapy.Item):
    """Container for one scraped movie entry.

    The spider and the pipelines in this project assign and read the
    ``movie_*`` and ``number`` keys, so those keys must be declared here:
    assigning a key that is not a declared field raises ``KeyError``.
    """

    # Keys populated by the spider and consumed by the pipelines.
    # Movie title
    movie_name = scrapy.Field()
    # Movie information (director, cast, year, ...)
    movie_message = scrapy.Field()
    # Rating
    movie_star = scrapy.Field()
    # One-line review quote
    movie_quote = scrapy.Field()
    # Number of people who rated the movie
    number = scrapy.Field()

    # Field names from the earlier version of this item, kept declared so
    # that any code still using them keeps working.
    title = scrapy.Field()
    movieInfo = scrapy.Field()
    star = scrapy.Field()
    quote = scrapy.Field()
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from Mycrawl.items import MovieItem
import requests
import time
class MovieSpider(Spider):
    """Crawl the Douban Top 250 list and yield one ``MovieItem`` per movie.

    Each listing page is parsed for its ``div.info`` blocks; the spider then
    follows the "next page" link until there is none.
    """

    # Spider name, used on the command line: ``scrapy crawl movie``.
    name = 'movie'
    # Browser-like User-Agent sent with every request (anti-blocking measure).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
    url = 'https://movie.douban.com/top250'
    # Throttle requests through the downloader. Calling time.sleep() inside a
    # callback would block the whole Twisted reactor instead of just pacing
    # the requests.
    custom_settings = {'DOWNLOAD_DELAY': 3}

    def start_requests(self):
        """Issue the request for the first listing page."""
        yield Request(self.url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Yield an item for every movie on the page, then follow the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``MovieItem`` objects, followed by at most one ``Request`` for the
            next listing page.
        """
        for movie in response.xpath('//div[@class="info"]'):
            name = movie.xpath('div[@class="hd"]/a/span/text()').extract()
            message = movie.xpath('div[@class="bd"]/p/text()').extract()
            star = movie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            number = movie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()
            quote = movie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # A fresh item per movie: reusing one instance would make every
            # yielded item alias the same, repeatedly overwritten, object.
            item = MovieItem()
            item['movie_name'] = ''.join(name)
            item['movie_message'] = ';'.join(message).replace(' ', '').replace('\n', '')
            # Missing nodes fall back to '' (as the quote does) instead of
            # aborting the whole page with an IndexError.
            item['movie_star'] = star[0] if star else ''
            # The second text node under the star block holds the vote count;
            # keep only the part before the '人' suffix.
            item['number'] = number[1].split('人')[0] if len(number) > 1 else ''
            item['movie_quote'] = quote[0] if quote else ''
            yield item

        nextpage = response.xpath('//span[@class="next"]/link/@href').extract()
        if nextpage:
            # The href is relative (a query string); resolve it against the
            # current page URL.
            yield Request(response.urljoin(nextpage[0]), headers=self.headers, callback=self.parse)
import pymysql
import pymongo
'''
class MycrawlPipeline(object):
def process_item(self, item, spider):
return item
'''
class MoviePipeline(object):
    """Item pipeline that stores scraped movies in the MySQL table ``Movie``.

    NOTE(review): another class later in this file is also named
    ``MoviePipeline``; if both stay in one module the later definition
    replaces this one. Confirm which backend is intended.
    """

    def __init__(self):
        """Connect to MySQL and empty the ``Movie`` table for a fresh run."""
        # Connect to the database.
        # NOTE(review): credentials are hard-coded here; consider reading
        # them from the project settings or the environment.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # Create the cursor used for all statements.
        self.cursor = self.conn.cursor()
        self.cursor.execute('truncate table Movie')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one movie row; on a database error report it and continue.

        Args:
            item: mapping with the keys ``movie_name``, ``movie_message``,
                ``movie_star``, ``number`` and ``movie_quote``.
            spider: the spider that produced the item (unused).

        Returns:
            The item, unchanged, so later pipelines still receive it.
        """
        row = (item['movie_name'], item['movie_message'], item['movie_star'],
               item['number'], item['movie_quote'])
        try:
            self.cursor.execute(
                "insert into Movie (name,movieInfo,star,number,quote) "
                "VALUES (%s,%s,%s,%s,%s)", row)
            self.conn.commit()
        except pymysql.Error:
            # Discard the failed statement so the connection stays usable
            # for the following items.
            self.conn.rollback()
            print("Error%s,%s,%s,%s,%s" % row)
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
class MoviePipeline(object):
    """Item pipeline that stores scraped movies in the MongoDB collection ``TESTDB.movie``.

    NOTE(review): this file defines another class named ``MoviePipeline``
    earlier on; if both stay in one module this definition replaces the
    earlier one. Confirm which backend is intended.
    """

    def __init__(self):
        """Connect to the local MongoDB server and select the target collection."""
        # Connect to the database.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.test = self.client['TESTDB']
        self.post = self.test['movie']

    def process_item(self, item, spider):
        """Insert the item as one document and pass it on.

        Args:
            item: the scraped item; it is converted with ``dict(item)``.
            spider: the spider that produced the item (unused).

        Returns:
            The item, unchanged, so later pipelines still receive it.
        """
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.client.close()
在爬虫运行前需要建立数据库表格,有两种方法可以建立,一种是可以通过命令行进入数据库输入创建表命令创建;另一种是通过Python进行创建。
在这里我说说第二种,创建一个.py文件,然后在里面编写
import pymysql
# Create the BOOK table in TESTDB, dropping any existing table of that name.
# NOTE(review): the MySQL pipeline in this file writes to a table named
# `Movie` (columns name, movieInfo, star, number, quote), which this script
# does not create; confirm which table is intended here.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='你的密码', db='TESTDB', charset='utf8')
try:
    with db.cursor() as cursor:
        cursor.execute('DROP TABLE IF EXISTS BOOK')
        # book_type originally read "NULL NULL"; written as DEFAULT NULL to
        # match the sibling nullable columns.
        sql = """CREATE TABLE BOOK(
    id INT NOT NULL PRIMARY KEY AUTO_INCREMENT COMMENT '自增 id',
    book_name VARCHAR(1024) NOT NULL COMMENT '小说名',
    author VARCHAR(1024) NOT NULL COMMENT '小说作者',
    book_type VARCHAR(1024) DEFAULT NULL COMMENT '小说类型',
    book_state VARCHAR(1024) DEFAULT NULL COMMENT '小说状态',
    book_update VARCHAR(1024) DEFAULT NULL COMMENT '小说更新',
    book_time VARCHAR(1024) DEFAULT NULL COMMENT '更新时间',
    new_href VARCHAR(1024) DEFAULT NULL COMMENT '最新一章',
    book_intro VARCHAR(1024) DEFAULT NULL COMMENT '小说简介',
    createtime DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT '添加时间'
)"""
        cursor.execute(sql)
finally:
    # Always release the connection, even if a statement above fails.
    db.close()
这样,就完成了创建一个表了。
关于更多的MySQL数据库操作命令,可以参考 http://blog.csdn.net/Mr_blueD/article/details/79344462
在创建的scrapy项目的文件夹下输入
scrapy crawl movie(这个movie是我在Spider里写的爬虫名)
另有爬取起点小说网数据的Scrapy实战,请前往 http://blog.csdn.net/Mr_blueD/article/details/79343349