Scrapy is an application framework written for crawling web pages and extracting structured data. The framework comes packaged: it bundles request scheduling, a downloader, response parsing, the Twisted asynchronous networking engine, and so on, which makes crawling web content very fast.
The crawl is built in five steps: create the project, define the item fields, configure the settings, write the spider logic, and run the crawler.
scrapy startproject stockstar
scrapy startproject is a fixed command, run from the command line (cmd), that creates a crawler project named stockstar (creating the project this way generates the full set of project files, which makes the later steps easier).
# items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class StockstarItemLoader(ItemLoader):
    # Custom ItemLoader used to store the field contents scraped by the spider
    default_output_processor = TakeFirst()

class StockstarItem(scrapy.Item):  # define the corresponding fields
    # define the fields for your item here like:
    # name = scrapy.Field()
    code = scrapy.Field()            # stock code
    abbr = scrapy.Field()            # stock abbreviation
    last_trade = scrapy.Field()      # latest price
    chg_ratio = scrapy.Field()       # change percentage
    chg_amt = scrapy.Field()         # change amount
    chg_ratio_5min = scrapy.Field()  # 5-minute change percentage
    volumn = scrapy.Field()          # trading volume
    turn_over = scrapy.Field()       # turnover
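As a rough illustration of what TakeFirst does here (the values below are made up), collecting several values for one field and then calling load_item() keeps only the first one:

# Hypothetical check, e.g. in a Python shell inside the project:
loader = StockstarItemLoader(item=StockstarItem())
loader.add_value('code', ['600000', '600001'])  # two candidate values for the same field
print(loader.load_item())  # {'code': '600000'} -- TakeFirst keeps only the first value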
# settings.py
from scrapy.exporters import JsonLinesItemExporter

# By default, Chinese characters in the output are shown as hard-to-read Unicode escapes.
# Define a subclass that keeps the original characters
# (simply set the parent class's ensure_ascii attribute to False).
class CustomJsonLinesItemExporter(JsonLinesItemExporter):
    def __init__(self, file, **kwargs):
        super(CustomJsonLinesItemExporter, self).__init__(file, ensure_ascii=False, **kwargs)

# Enable the newly defined exporter class
FEED_EXPORTERS = {
    'json': 'stockstar.settings.CustomJsonLinesItemExporter',
}
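To see why ensure_ascii matters, here is a quick comparison using the standard json module (purely illustrative, not part of the project code):

import json
print(json.dumps({"abbr": "浦发银行"}))                      # {"abbr": "\u6d66\u53d1\u94f6\u884c"}
print(json.dumps({"abbr": "浦发银行"}, ensure_ascii=False))  # {"abbr": "浦发银行"}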
…
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
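If the target site is sensitive to request rate, Scrapy's AutoThrottle extension (referenced in the comment above) can be enabled alongside the fixed delay. One possible configuration, not part of the original settings shown here, would be:

# Optional: let Scrapy adapt the delay to server load instead of using only a fixed value
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10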
First, enter the following commands in cmd to generate the spider code:
cd stockstar
scrapy genspider stock quote.stockstar.com
Then define the crawling logic in spiders/stock.py:
# spiders/stock.py
import scrapy
from stockstar.items import StockstarItem, StockstarItemLoader

class StockSpider(scrapy.Spider):
    name = 'stock'  # spider name
    allowed_domains = ['quote.stockstar.com']  # domains the spider is allowed to crawl
    start_urls = ['http://quote.stockstar.com/stock/ranklist_a_3_1_1.html']
    # starting URL of the crawl

    def parse(self, response):  # crawling logic
        page = int(response.url.split("_")[-1].split(".")[0])  # extract the page number
        item_nodes = response.css('#datalist tr')
        for item_node in item_nodes:
            # scrape each field defined in the item file
            item_loader = StockstarItemLoader(item=StockstarItem(), selector=item_node)
            item_loader.add_css("code", "td:nth-child(1) a::text")
            item_loader.add_css("abbr", "td:nth-child(2) a::text")
            item_loader.add_css("last_trade", "td:nth-child(3) span::text")
            item_loader.add_css("chg_ratio", "td:nth-child(4) span::text")
            item_loader.add_css("chg_amt", "td:nth-child(5) span::text")
            item_loader.add_css("chg_ratio_5min", "td:nth-child(6) span::text")
            item_loader.add_css("volumn", "td:nth-child(7)::text")
            item_loader.add_css("turn_over", "td:nth-child(8)::text")
            stock_item = item_loader.load_item()
            yield stock_item
        if item_nodes:
            next_page = page + 1
            next_url = response.url.replace("{0}.html".format(page), "{0}.html".format(next_page))
            yield scrapy.Request(url=next_url, callback=self.parse)
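Before running the full crawl, the CSS selectors above can be checked interactively with scrapy shell (assuming the page structure still matches what the spider expects):

scrapy shell "http://quote.stockstar.com/stock/ranklist_a_3_1_1.html"
>>> response.css('#datalist tr td:nth-child(1) a::text').extract_first()
>>> response.css('#datalist tr td:nth-child(2) a::text').extract_first()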
Finally, in the stockstar directory, run the following command (in cmd) to test the crawl:
scrapy crawl stock -o item.json
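With the CustomJsonLinesItemExporter enabled, each line of item.json is one JSON object per stock, roughly of the following form (the field values here are invented for illustration):

{"code": "600000", "abbr": "浦发银行", "last_trade": "10.50", "chg_ratio": "1.00%", ...}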
Selenium-based crawlers are mainly used to scrape dynamically rendered pages.
The complete code for crawling the e-commerce site data is as follows:
import requests
import urllib
import time
import random
# Browser driver object
from selenium import webdriver
# By locates DOM elements in the HTML document; WebDriverWait waits for the page
# to finish loading; expected_conditions specifies the condition that ends the wait.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_url(url):
    time.sleep(5)
    return requests.get(url)

if __name__ == "__main__":
    # Launch the Chrome browser
    driver = webdriver.Chrome()
    dep_cities = ["北京", "上海", "广州", "深圳", "天津", "杭州", "南京", "济南",
                  "重庆", "青岛", "大连", "宁波", "厦门", "成都", "武汉", "哈尔滨", "沈阳", "西安", "长春", "长沙",
                  "福州", "郑州", "石家庄", "苏州", "佛山", "烟台", "合肥", "昆明", "唐山", "乌鲁木齐",
                  "兰州", "呼和浩特", "南通", "潍坊", "绍兴", "邯郸", "东营", "嘉兴", "泰州", "江阴",
                  "金华", "鞍山", "襄阳", "南阳", "岳阳", "漳州", "淮安", "湛江", "柳州", "绵阳"]
    for dep in dep_cities:
        strhtml = get_url('https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep='
                          + urllib.request.quote(dep) + '&exclude=&extensionImg=255,175')
        arrive_dict = strhtml.json()
        for arr_item in arrive_dict['data']:
            for arr_item_1 in arr_item['subModules']:
                for query in arr_item_1['items']:
                    # Open the page with a GET request
                    driver.get("https://fh.dujia.qunar.com/?tf=package")
                    # Wait until the page has loaded, up to a maximum of 10 seconds
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "depCity")))
                    # Find the departure-city input box and clear its contents
                    driver.find_element_by_xpath("//*[@id='depCity']").clear()
                    # Fill the departure city into the departure input box
                    driver.find_element_by_xpath("//*[@id='depCity']").send_keys(dep)
                    # Fill the destination into the destination input box
                    driver.find_element_by_xpath("//*[@id='arrCity']").send_keys(query["query"])
                    # Click the "开始定制" (start customizing) button on the page
                    driver.find_element_by_xpath("/html/body/div[2]/div[1]/div[2]/div[3]/div/div[2]/div/a").click()
                    print("dep: %s arr: %s" % (dep, query["query"]))
                    for i in range(100):
                        time.sleep(random.uniform(5, 6))
                        # If the results page cannot be found, break out of the loop
                        pageBtns = driver.find_elements_by_xpath("html/body/div[2]/div[2]/div[8]")
                        if pageBtns == []:
                            break
                        # Locate the result data, then extract it block by block
                        routes = driver.find_elements_by_xpath("html/body/div[2]/div[2]/div[7]/div[2]/div")
                        for route in routes:
                            result = {
                                'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
                                'dep': dep,
                                'arrive': query['query'],
                                'result': route.text
                            }
                            print(result)
                        # Turn to the next page of results
                        if i < 9:
                            btns = driver.find_elements_by_xpath("html/body/div[2]/div[2]/div[8]/div/div/a")
                            for a in btns:
                                if a.text == u"下一页":
                                    a.click()
                                    break
    driver.close()
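Note that the find_element_by_xpath / find_elements_by_xpath helpers used above were removed in Selenium 4; to run this code on a recent Selenium release, each call would need the find_element(By.XPATH, ...) form instead, for example:

# Selenium 4 style equivalent of the clear/send_keys calls above (page structure assumed unchanged):
driver.find_element(By.XPATH, "//*[@id='depCity']").clear()
driver.find_element(By.XPATH, "//*[@id='depCity']").send_keys(dep)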
("Sometimes that thing you're searching for your whole life...it's right there by your side all along."--《Guardians of the Galaxy Vol. 2》
)