Libraries used: scrapy, selenium webdriver, datetime, re, and Python's ORM framework SQLAlchemy.
In the cmd command line:
Create the Scrapy project: scrapy startproject sina sina.com
cd into the sina folder
Create the spider: scrapy genspider sina1 sina.com.cn
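After these two commands the generated project follows the standard Scrapy layout, roughly like this (main.py is added by hand later, see the end of this post):

sina/                 # project package
    items.py          # DataItem lives here
    pipelines.py      # SinaPipeline lives here
    settings.py
    middlewares.py
    spiders/
        sina1.py      # the spider generated by genspider
scrapy.cfg
main.py               # added by hand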
Initialize the list of start URLs
Initialize the page and flag parameters
Initialize self.option, the webdriver options
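Before the __init__ fragment below can run, spiders/sina1.py needs its imports and the class declaration; the method fragments that follow (__init__, start_requests, parse, parse_namedetail) all belong inside this class. A minimal header sketch (the DataItem import path assumes the project module is named sina):

import datetime
import re

from scrapy import Request, Selector, Spider
from selenium import webdriver

from sina.items import DataItem  # assumption: the project module is named "sina"

class Sina1Spider(Spider):
    name = 'sina1'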
def __init__(self, page=None, flag=None, *args, **kwargs):
    # initialize the parameters
    super(Sina1Spider, self).__init__(*args, **kwargs)
    self.page = int(page)  # how many pages to crawl; hard-coding it would be inflexible, so it is passed in as an argument
    self.flag = int(flag)  # where do the actual values come from? from the command in main.py
    self.start_urls = ['https://news.sina.com.cn/china/',
                       'https://ent.sina.com.cn/film/',
                       'https://ent.sina.com.cn/zongyi/',
                       'https://ent.sina.com.cn/star/',
                       'http://eladies.sina.com.cn/']
    self.option = webdriver.ChromeOptions()
    self.option.add_argument('--headless')  # do not open a visible browser window
    self.option.add_argument('--no-sandbox')  # disable the sandbox
    self.option.add_argument('--blink-settings=imagesEnabled=false')  # do not load images
def start_requests(self):
    """
    Take each url out of start_urls (set in __init__),
    yield a Request object for it, and hand it to the parse method via the callback parameter
    """
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse)
def parse(self, response):
    """
    Parse the list page
    :param response:
    :return:
    """
    driver = webdriver.Chrome(chrome_options=self.option)
    driver.set_page_load_timeout(30)
    driver.get(response.url)
    # the lines above set up webdriver: open the (headless) browser and tell it which page to load
    # [core logic] a double for loop:
    #   the outer loop runs once per page (self.page pages in total):
    #     a page is not fully loaded at once; scrolling down triggers the "load more" effect
    #   the inner loop runs once per news headline on the page:
    #     each iteration yields a Request for that headline and hands it to parse_namedetail via the callback parameter
    for i in range(self.page):
        while not driver.find_element_by_xpath("//div[@class='feed-card-page']").text:  # keep scrolling until the pagination bar appears
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")  # scroll the page down to the bottom
        title = driver.find_elements_by_xpath("//h2[@class='undefined']/a[@target='_blank']")
        time = driver.find_elements_by_xpath("//h2[@class='undefined']/../div[@class='feed-card-a feed-card-clearfix']/div[@class='feed-card-time']")
        # no finer-grained item classes; DataItem is used directly
        for i in range(len(title)):
            eachtitle = title[i].text
            eachtime = time[i].text
            item = DataItem()
            if response.url == "https://ent.sina.com.cn/zongyi/":
                item['type'] = 'zongyi'
            elif response.url == "https://news.sina.com.cn/china/":
                item['type'] = 'news'
            elif response.url == "https://ent.sina.com.cn/film/":
                item['type'] = 'film'
            elif response.url == "https://ent.sina.com.cn/star/":
                item['type'] = 'star'
            elif response.url == "http://eladies.sina.com.cn/":
                item['type'] = 'nvxing'
            item['title'] = eachtitle
            item['desc'] = ''
            href = title[i].get_attribute('href')
            today = datetime.datetime.now()
            eachtime = eachtime.replace('今天', str(today.month) + '月' + str(today.day) + '日')  # turn '今天 xx:xx' into an explicit month/day form
            # normalize the date shown on the list page; it comes in several formats, so handle each case
            if '分钟前' in eachtime:
                minute = int(eachtime.split('分钟前')[0])
                t = datetime.datetime.now() - datetime.timedelta(minutes=minute)
                t2 = datetime.datetime(year=t.year, month=t.month, day=t.day, hour=t.hour, minute=t.minute)
            else:
                if '年' not in eachtime:  # prefix the current year when it is missing
                    eachtime = str(today.year) + '年' + eachtime
                t1 = re.split(r'[日月年:]', eachtime)
                t2 = datetime.datetime(year=int(t1[0]), month=int(t1[1]), day=int(t1[2]),
                                       hour=int(t1[3]), minute=int(t1[4]))
            item['times'] = t2
            # decide between a full crawl and an incremental crawl: 1 = incremental, 0 = full
            if self.flag == 1:
                today = datetime.datetime.now().strftime("%Y-%m-%d")
                yesterday = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
                if item['times'].strftime("%Y-%m-%d") < yesterday:  # anything older than yesterday is dropped
                    driver.close()
                    break
                elif yesterday <= item['times'].strftime("%Y-%m-%d") < today:
                    yield Request(response.urljoin(href), meta={'name': item}, callback=self.parse_namedetail)
            else:
                yield Request(response.urljoin(href), meta={'name': item}, callback=self.parse_namedetail)
            # a Request object is yielded and handed to parse_namedetail via the callback parameter
        # click to the next page
        try:
            driver.find_element_by_xpath("//div[@class='feed-card-page']/span[@class='pagebox_next']/a").click()
        except:
            break
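For reference, here is a minimal standalone sketch of the same date normalization, run against three made-up strings in the formats the feed shows ('xx分钟前', '今天 xx:xx', 'x月x日 xx:xx'); it only exists to make the branching above easier to follow:

import datetime
import re

def normalize(eachtime):
    today = datetime.datetime.now()
    eachtime = eachtime.replace('今天', str(today.month) + '月' + str(today.day) + '日')
    if '分钟前' in eachtime:
        minute = int(eachtime.split('分钟前')[0])
        t = datetime.datetime.now() - datetime.timedelta(minutes=minute)
        return datetime.datetime(year=t.year, month=t.month, day=t.day, hour=t.hour, minute=t.minute)
    if '年' not in eachtime:
        eachtime = str(today.year) + '年' + eachtime
    t1 = re.split(r'[日月年:]', eachtime)
    return datetime.datetime(year=int(t1[0]), month=int(t1[1]), day=int(t1[2]),
                             hour=int(t1[3]), minute=int(t1[4]))

for s in ['5分钟前', '今天 10:30', '6月1日 08:00']:  # hypothetical sample values
    print(s, '->', normalize(s))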
map(func, iterable): the map function feeds every element of the iterable through func and returns a new iterator (this is what cleans up desc below).
yield item hands the item over to the pipeline, where the pipeline's process_item method performs the data persistence.
def parse_namedetail(self, response):
"""
从每一个单独的新闻页面中解析出time、desc、item
:param response:
:return:
"""
#css selector选取元素
selector = Selector(response)
item = response.meta['name']
desc = selector.xpath("//div[@class='article']/p/text()").extract()
#处理desc
desc = list(map(str.strip,desc))
item['desc'] = ''.join(desc)
yield item
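A quick illustration of the map(str.strip, ...) cleanup used above, with two made-up paragraph strings:

desc = ['  第一段内容\n', '\t第二段内容  ']
desc = list(map(str.strip, desc))  # strip the surrounding whitespace from every paragraph
print(''.join(desc))  # -> 第一段内容第二段内容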
items.py is used to receive the items produced by the spider:
import scrapy

class DataItem(scrapy.Item):
    title = scrapy.Field()
    desc = scrapy.Field()
    times = scrapy.Field()
    type = scrapy.Field()
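For reference, a scrapy.Item is used like a dict, which is exactly how the spider fills DataItem:

item = DataItem()
item['title'] = '示例标题'  # hypothetical values
item['type'] = 'news'
print(dict(item))  # {'title': '示例标题', 'type': 'news'}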
The items are persisted to MySQL with SQLAlchemy in pipelines.py.
1. Define the table model

Base = declarative_base()

class Data(Base):
    __tablename__ = 'data'
    id = Column(Integer(), primary_key=True)
    times = Column(DateTime)
    title = Column(Text())
    content = Column(Text())
    type = Column(Text())
2. Initialize the database engine and bind a session factory to it (this goes in the pipeline's __init__)

# initialize the database engine and bind it
self.engine = create_engine('mysql+pymysql://root:123456@localhost:3306/sina', encoding='utf-8')
Base.metadata.create_all(self.engine)
self.DBSession = sessionmaker(bind=self.engine)
3. Create a session, add the new record, and commit

session = self.DBSession()
session.add(new)
session.commit()
Full pipelines.py code
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, create_engine, Text, DateTime, String, Integer
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Data(Base):
    __tablename__ = 'data'
    id = Column(Integer(), primary_key=True)
    times = Column(DateTime)
    title = Column(Text())
    content = Column(Text())
    type = Column(Text())

class SinaPipeline:
    def __init__(self):
        # initialize the database engine, create the table if needed, and bind a session factory
        self.engine = create_engine('mysql+pymysql://root:123456@localhost:3306/sina', encoding='utf-8')
        Base.metadata.create_all(self.engine)
        self.DBSession = sessionmaker(bind=self.engine)

    def process_item(self, item, spider):
        new = Data()
        new.title = item['title']
        new.times = item['times']
        new.content = item['desc']
        new.type = item['type']
        session = self.DBSession()
        session.add(new)
        session.commit()
        return item
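One step the fragments above do not show: Scrapy only calls the pipeline if it is registered in settings.py. Assuming the project module is named sina, the entry looks like this:

# settings.py
ITEM_PIPELINES = {
    'sina.pipelines.SinaPipeline': 300,  # the number is the pipeline's order (lower runs earlier)
}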
main.py
It has to pass in the two args required by the spider's __init__ method,
in the form -a page=10 -a flag=0 (note the spaces).
from scrapy import cmdline
cmdline.execute('scrapy crawl sina1 -a page=10 -a flag=0'.split())
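After main.py finishes, a minimal sketch to check that rows actually reached MySQL, assuming the same connection string and that the Data model is importable from sina.pipelines:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from sina.pipelines import Data  # assumption: reuse the model defined in pipelines.py

engine = create_engine('mysql+pymysql://root:123456@localhost:3306/sina')
session = sessionmaker(bind=engine)()
print(session.query(Data).count())  # number of rows written so far
for row in session.query(Data).order_by(Data.times.desc()).limit(5):
    print(row.times, row.type, row.title)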