day13
Selenium can scrape dynamically rendered data.
Scrapy on its own cannot: if the data arrives via an ajax request, you can call that API endpoint directly; if the content is rendered by js, you need to combine Scrapy with Selenium.
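For the ajax case, you can usually find the underlying request in the browser's Network panel and have Scrapy call it directly. A minimal sketch, assuming a hypothetical JSON endpoint and response shape:
```Python
import json
import scrapy

class AjaxSpider(scrapy.Spider):
    name = 'ajax_demo'
    # hypothetical endpoint -- look up the real one in the browser's Network panel
    start_urls = ['https://example.com/api/news?page=1']

    def parse(self, response):
        # The endpoint returns JSON, so parse it instead of using xpath
        data = json.loads(response.text)
        for entry in data.get('items', []):
            yield {'title': entry.get('title')}
```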
# Spider code:
```Python
import scrapy
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from ..items import WynewsItem

class NewsSpider(scrapy.Spider):
    name = 'news'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://news.163.com/domestic/']
    # Hide the "Chrome is being controlled by automated software" banner
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Shared browser instance; the downloader middleware reaches it via spider.bro
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\news\wynews\wynews\spiders\chromedriver.exe', options=option)

    def detail_parse(self, response):
        content_list = response.xpath('//div[@id="endText"]/p//text()').extract()
        content = ''.join(content_list)
        title = response.meta['title']
        item = WynewsItem()
        item['title'] = title
        item['content'] = content
        yield item

    def parse(self, response):
        div_list = response.xpath('//div[contains(@class, "data_row")]')
        for div in div_list:
            link = div.xpath('./a/@href').extract_first()
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            # Hand the title to the detail-page callback through meta
            yield scrapy.Request(url=link, callback=self.detail_parse, meta={'title': title})
```
# Middleware code:
```Python
import time
from scrapy.http import HtmlResponse

class WynewsDownloaderMiddleware(object):
    def process_response(self, request, response, spider):
        bro = spider.bro
        # Only the index page needs js rendering; detail pages are static
        if request.url in spider.start_urls:
            bro.get(request.url)
            time.sleep(3)
            # Scroll to the bottom so lazily loaded entries render
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            bro.execute_script(js)
            time.sleep(3)
            response_selenium = bro.page_source
            # Swap in a response built from the rendered page source
            return HtmlResponse(url=bro.current_url, body=response_selenium, encoding='utf-8', request=request)
        return response
```
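The middleware only takes effect once it is enabled in settings.py. A sketch, assuming the default project layout (wynews/middlewares.py) and the Scrapy template's default priority:
```Python
# settings.py -- module path assumes the default Scrapy project layout
DOWNLOADER_MIDDLEWARES = {
    'wynews.middlewares.WynewsDownloaderMiddleware': 543,
}
```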
# Pipeline code:
```Python
import pymongo

class WynewsPipeline(object):
    conn = pymongo.MongoClient('localhost', 27017)
    db = conn.wynews
    table = db.newsinfo

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo 3+; use insert_one()
        self.table.insert_one(dict(item))
        return item
```
# Overview:
1. pipelines: used for data persistence.
2. Data can be persisted in many ways: MongoDB, MySQL, Redis, CSV (a CSV sketch follows below).
3. The method you must implement: process_item.
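For the CSV option, a minimal pipeline sketch (the output filename and field names are assumptions; match them to your Item):
```Python
import csv

class CsvPipeline(object):
    def open_spider(self, spider):
        # assumed output file; newline='' avoids blank rows on Windows
        self.file = open('news.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file, fieldnames=['title', 'content'])
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.file.close()
```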
# Core methods:
open_spider(self, spider): called when the spider is opened.
close_spider(self, spider): called when the spider is closed.
from_crawler(cls, crawler): a class method, marked with @classmethod; gives access to configuration via crawler.settings.
process_item(self, item, spider): interacts with the database to store the data; this method must be implemented *****
# MongoDB interaction:
```Python
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection info from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo 3+; use insert_one()
        self.db['news'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
```
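from_crawler reads these two keys from settings.py; the values below are assumptions for a local MongoDB:
```Python
# settings.py -- assumed local MongoDB
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'wynews'
```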
# MySQL interaction:
```Python
import pymysql

class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT')
        )

    def open_spider(self, spider):
        # pymysql expects charset 'utf8'/'utf8mb4', not 'utf-8'
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8mb4', port=self.port)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        data = dict(item)
        table = 'news'  # target table; adjust to your schema
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        # Column and table names are interpolated; the values are parameterized
        sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
```
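from_crawler expects the matching keys in settings.py; the credentials below are placeholders:
```Python
# settings.py -- placeholder MySQL credentials
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'wynews'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
```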
# Pipeline class for file downloads
# Spider code:
```Python
import scrapy
from ..items import XhxhItem

class XhSpider(scrapy.Spider):
    name = 'xh'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.521609.com/qingchunmeinv/']

    def parse(self, response):
        li_list = response.xpath('//div[@class="index_img list_center"]/ul/li')
        for li in li_list:
            item = XhxhItem()
            link = li.xpath('./a[1]/img/@src').extract_first()
            # src is a relative path; prepend the site root
            item['img_link'] = 'http://www.521609.com' + link
            print(item)
            yield item
```
# Items code:
```Python
import scrapy

class XhxhItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    img_link = scrapy.Field()
```
# Pipeline code:
```Python
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class XhxhPipeline(object):
    def process_item(self, item, spider):
        return item

class ImgPipeLine(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Schedule a download request for each image link
        yield scrapy.Request(url=item['img_link'])

    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of the URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Pass the item on to lower-priority pipelines
        return item
```
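item_completed also receives the download results, so it is the natural place to drop items whose image failed to download. A sketch using Scrapy's DropItem, replacing the pass-through version above:
```Python
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class ImgPipeLine(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, image_info_or_failure) tuples
        if not any(ok for ok, _ in results):
            raise DropItem('image download failed: %s' % item['img_link'])
        return item
```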
# Settings code:
```Python
ITEM_PIPELINES = {
    'xhxh.pipelines.XhxhPipeline': 300,
    'xhxh.pipelines.ImgPipeLine': 301,
}
IMAGES_STORE = './mvs'
```
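Note: because file_path returns a bare filename, images are saved directly under ./mvs/ rather than ImagesPipeline's default full/ subdirectory, and in ITEM_PIPELINES the lower number (300) runs first.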