使用Python爬虫获取上交所和深交所所有股票的名称和交易信息

使用Python爬虫获取上交所和深交所所有股票的名称和交易信息

功能描述

目标

获取上交所和深交所所有股票的名称和交易信息

输出

保存到文件中

技术路线

requests-bs4-re

候选数据网站的选择

网易财经:https://money.163.com/stock/
新浪股票:http://finance.sina.com.cn/stock

选取原则

股票信息静态存在HTML页面中,非js代码生成,没有robots协议限制

选取方法

浏览器F12,源代码查看等

程序的结构设计

步骤1:

从网易财经获取股票列表

步骤2:

根据股票列表逐个到网易财经获取个股信息

步骤3:

将结果存储到文件

import re
import requests
from bs4 import BeautifulSoup
# Path of the intermediate file that accumulates one stock code per line;
# written by getcodelist() and read back by savestocklist().
codepath = 'code.txt'
def gethtml(url):
    """Fetch *url* and return the decoded page text, or '' on any failure.

    Sets encoding from apparent_encoding so Chinese pages (GBK/GB2312)
    decode correctly.  Callers treat '' as "page unavailable".
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        # timeout prevents a dead server from hanging the crawl forever
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP errors are expected here; a bare except would
        # also swallow KeyboardInterrupt and programming errors.
        return ''
def getcodelist(url, page):
    """Append every 6-digit stock code found on list pages 0..page to codepath.

    Fetches url+str(p) for each page index and extracts codes with a regex.
    The original pattern r'[0-9]*' also matched the empty string at every
    position in the page, flooding the output file with blank lines;
    \\d{6} captures exactly the exchange codes.  Passing each pure-digit
    match through BeautifulSoup was a no-op and has been removed.
    """
    code_pattern = re.compile(r'\d{6}')  # hoisted: reused on every page
    # Open the output file once instead of once per code.
    with open(codepath, 'a') as out:
        for p in range(page + 1):
            html = gethtml(url + str(p))
            if not html:
                continue  # fetch failed; skip rather than scan ''
            for code in code_pattern.findall(html):
                out.write(code + '\n')

def _parse_stock_page(html):
    """Extract the quote dict from a 163 stock page, or None if unparseable.

    The quote values are embedded as quoted strings inside the 5th <script>
    tag of <body>; observed field order: name, code, price, change,
    yesterday, today, high, low.  NOTE(review): script index 4 and the
    field order are assumed from the original code — confirm against a
    live page.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    scripts = soup.select("body  script")
    if len(scripts) <= 4:
        return None
    text = scripts[4].get_text().replace(' ', '')
    fields = re.findall(r'\'.*\'', text)
    if len(fields) < 8:
        return None
    keys = ('name', 'code', 'price', 'change',
            'yesterday', 'today', 'high', 'low')
    return dict(zip(keys, fields))


def savestocklist(stockurl):
    """For each code in codepath, fetch its 163 quote page and append the
    parsed record to stock.txt.

    Each code is tried with market prefix '0' first, then '1' (163 URLs
    are <prefix><code>.html; presumably the prefix selects the exchange —
    verify).  The original duplicated the whole parse/save logic inside a
    bare ``except:`` for the '1' retry, and an unparseable code there
    raised and killed the entire loop; now such codes are simply skipped.
    """
    with open(codepath, 'r') as codes, \
            open('stock.txt', 'a', encoding='utf-8') as out:
        for line in codes:
            code = line.strip()
            if not code:
                continue
            for prefix in ('0', '1'):
                page = gethtml(stockurl + prefix + code + '.html')
                stock = _parse_stock_page(page)
                if stock is not None:
                    out.write(str(stock) + '\n')
                    break

def main():
    """Crawl the first 3 list pages (indices 0-2) of the 163 earnings-forecast
    table for stock codes, then fetch and save each stock's quote details."""
    list_url = (
        'http://quotes.money.163.com/data/caibao/yjyg_00.html'
        '?reportdate=20190930&sort=reportdate&order=desc&page='
    )
    quote_base = 'http://quotes.money.163.com/'
    getcodelist(list_url, 2)
    savestocklist(quote_base)

# Guard the entry point so importing this module does not start the crawl.
if __name__ == '__main__':
    main()


你可能感兴趣的:(python,爬虫)