【python爬虫实战】爬取股票数据

爬虫介绍

        什么是爬虫?简单理解来说就是抓取网络上的数据(文档、资料、图片等)。比如你考研时可以爬取文档和学习资料,需要网络上的表格数据做分析时可以批量抓取,也可以批量下载图片等。

爬取网站

本次爬虫实战用的网站:http://quote.stockstar.com

爬虫步骤

一、随机header

股票数据的量非常大,这里在爬取股票数据的时候,需要注意的就是反爬虫的工作。参考了很多代码,总结出比较好的思路:设置很多header,每次随机抽取一个header进行数据访问。下面给出这些header供参考。

# Pool of desktop-browser User-Agent strings; one is chosen at random for each
# request so that traffic looks less like a single automated client.
user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
              'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

# Build the per-page URL and attach a randomly chosen User-Agent header.
url = 'http://quote.stockstar.com/stock/ranklist_a_3_1_%s.html' % page
request = urllib.request.Request(url=url, headers={"User-Agent": random.choice(user_agent)})

二、设置休眠

设置随机休眠。频繁抓取会被判定为机器,触发网页反爬机制。

time.sleep(random.randrange(1,4))  # pause a random 1-3 seconds between requests to look less like a bot

三、获取数据

# Crawl pages 1-7 of the A-share ranking list and print a few columns per row.
stock_total = []  # accumulated cell texts from every page
for page in range(1, 8):  # download pages 1 through 7
    url = 'http://quote.stockstar.com/stock/ranklist_a_3_1_' + str(page) + '.html'
    # Random User-Agent per request to reduce the chance of being blocked.
    request = urllib.request.Request(url=url,
                                     headers={"User-Agent": random.choice(user_agent)})
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:  # HTTP-level failure (4xx/5xx)
        print('page=', page, '', e.code)
        continue  # bug fix: previously fell through and reused a stale/undefined response
    except urllib.error.URLError as e:  # network-level failure (DNS, refused connection, ...)
        print('page=', page, '', e.reason)
        continue
    content = response.read().decode('gbk')  # the site serves GBK-encoded pages
    print('get page', page)  # report each page fetched successfully
    # Restored regex: grab the table body that holds the quote rows. The pattern
    # had been stripped to re.compile('') in the original text.
    pattern = re.compile(r'<tbody[\s\S]*</tbody>')
    body = re.findall(pattern, content)
    if not body:  # guard: avoid IndexError when the page has no matching table
        print('no table found on page', page)
        continue
    pattern = re.compile('>(.*?)<')  # text between adjacent tags = one cell value
    stock_page = re.findall(pattern, body[0])
    stock_total.extend(stock_page)
    time.sleep(random.randrange(1, 4))  # random 1-3 s pause between pages
# Drop the empty strings produced by adjacent tags (was a remove-while-iterating loop).
stock_last = [data for data in stock_total if data != '']
# Print part of the result: code, name, latest price, change %, change, 5-min change.
print('代码', '\t', '简称', '   ', '\t', '最新价', '\t', '   ', '涨跌幅', '\t', '   ', '涨跌额', '\t', '   ',
      '5分钟涨幅')
# Each row has 13 columns; stop early enough that i + 5 stays in range.
for i in range(0, len(stock_last) - 5, 13):
    print(stock_last[i], '\t', stock_last[i + 1], ' ', '\t', stock_last[i + 2], '  ', '\t', stock_last[i + 3], '  ',
          '\t', stock_last[i + 4], '  ', '\t', stock_last[i + 5])

源码

import random
import re
import time
import urllib
import urllib.error
import urllib.request

# 抓取所需内容
# Pool of desktop-browser User-Agent strings; one is chosen at random for each
# request so that traffic looks less like a single automated client.
user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
              'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

# Crawl pages 1-7 of the A-share ranking list and print a few columns per row.
stock_total = []  # accumulated cell texts from every page
for page in range(1, 8):  # download pages 1 through 7
    url = 'http://quote.stockstar.com/stock/ranklist_a_3_1_' + str(page) + '.html'
    # Random User-Agent per request to reduce the chance of being blocked.
    request = urllib.request.Request(url=url,
                                     headers={"User-Agent": random.choice(user_agent)})
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:  # HTTP-level failure (4xx/5xx)
        print('page=', page, '', e.code)
        continue  # bug fix: previously fell through and reused a stale/undefined response
    except urllib.error.URLError as e:  # network-level failure (DNS, refused connection, ...)
        print('page=', page, '', e.reason)
        continue
    content = response.read().decode('gbk')  # the site serves GBK-encoded pages
    print('get page', page)  # report each page fetched successfully
    # Restored regex: grab the table body that holds the quote rows. The pattern
    # had been stripped to re.compile('') in the original text.
    pattern = re.compile(r'<tbody[\s\S]*</tbody>')
    body = re.findall(pattern, content)
    if not body:  # guard: avoid IndexError when the page has no matching table
        print('no table found on page', page)
        continue
    pattern = re.compile('>(.*?)<')  # text between adjacent tags = one cell value
    stock_page = re.findall(pattern, body[0])
    stock_total.extend(stock_page)
    time.sleep(random.randrange(1, 4))  # random 1-3 s pause between pages
# Drop the empty strings produced by adjacent tags (was a remove-while-iterating loop).
stock_last = [data for data in stock_total if data != '']
# Print part of the result: code, name, latest price, change %, change, 5-min change.
print('代码', '\t', '简称', '   ', '\t', '最新价', '\t', '   ', '涨跌幅', '\t', '   ', '涨跌额', '\t', '   ',
      '5分钟涨幅')
# Each row has 13 columns; stop early enough that i + 5 stays in range.
for i in range(0, len(stock_last) - 5, 13):
    print(stock_last[i], '\t', stock_last[i + 1], ' ', '\t', stock_last[i + 2], '  ', '\t', stock_last[i + 3], '  ',
          '\t', stock_last[i + 4], '  ', '\t', stock_last[i + 5])

最后,有需要的话可以把抓取结果导出为 CSV 文件保存。

你可能感兴趣的:(python,python)