Python crawler: a worked example of scraping stock information, storing it in MySQL, and then visualizing it

Choosing a candidate data site
Selection criteria: the stock information should be present statically in the HTML page (not generated by JavaScript), and crawling should not be forbidden by the site's robots.txt.
How to check: the browser's F12 developer tools, viewing the page source, and so on (a quick programmatic check is sketched below).
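A minimal sketch of the robots.txt check using the standard library's urllib.robotparser, pointed at the list page used later in this article:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://www.banban.cn/robots.txt")
rp.read()  # fetch and parse the site's robots.txt
# True means a generic crawler ("*") is allowed to fetch this page
print(rp.can_fetch("*", "https://www.banban.cn/gupiao/list_sz.html"))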

Program structure:
Step 1: fetch the stock list (the reference video uses Eastmoney; the code below uses banban.cn)
Step 2: for each code in the list, fetch that stock's detail page (the video uses Baidu Stocks; the code below uses Eastmoney quote pages)
Step 3: save the results to a file

Reference video: https://www.bilibili.com/video/av9784617?p=47

Code:

import requests
import time
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code  # to auto-detect the page encoding instead, use: r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            # \d{6} matches the 6-digit stock code embedded in the link.
            # Note: re.findall returns a list such as ['600000'], so take [0];
            # otherwise we would append the whole list instead of the string.
            lst.append(re.findall(r"\d{6}", href)[0])
        except (KeyError, IndexError):
            continue

def getStockInfo(lst, stockURL, fpath):
    count = 0  # progress counter
    for stock in lst:
        url = stockURL + stock + ".html"  # build the per-stock URL
        html = getHTMLText(url)  # fetch the stock page
        try:
            if html == "":  # skip empty pages
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')  # parse the page
            text = soup.text
            # regex: extract the stock name
            stockName = re.search(r'[\w\u4e00-\u9fcc]+', text).group(0)
            # regex: extract the 6-digit stock code
            stockNumber = re.search(r'[0-9]\d{5}', text).group(0)
            # regex: extract the "个股日历" (stock calendar) section
            p = re.compile(r"个股日[\u4e00-\u9fa5]+[\s\S]+")
            stockHistory = p.findall(text)
            print(stockHistory)
            stockHistory1 = re.split(r" +", stockHistory[0])  # split on runs of spaces
            print(stockHistory1[0])
            # strip the "个股日历" label itself
            stockHistory2 = stockHistory1[0].replace("个股日历", "")
            print(stockHistory2)
            # printed for inspection only; stockHistory2 is what gets stored
            stockHistory3 = stockHistory2.replace("\n", ";", 100).replace("\r", ";")
            print(str(stockHistory3))

            infoDict.update({'股票名称': stockName})     # stock name
            infoDict.update({'股票代码': stockNumber})   # stock code
            infoDict.update({'股票日历': stockHistory2})  # stock calendar
            # If the page exposed <dt>/<dd> pairs (as Baidu Stocks did), they
            # could be folded into the dict like this:
            # keyList = stockInfo.find_all('dt')    # keys
            # valueList = stockInfo.find_all('dd')  # values
            # for i in range(len(keyList)):
            #     key = keyList[i].text
            #     val = valueList[i].text
            #     infoDict[key] = val  # assignment adds the pair to the dict
            # append the record to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                # '\r' returns the cursor to the start of the line so each print
                # overwrites the last one: a one-line dynamic progress display
                print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            # throttle: one request every 10 seconds
            time.sleep(10)
        except Exception:
            count = count + 1
            print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue



if __name__ == '__main__':
    # stock_list_url = "https://www.banban.cn/gupiao/list_sh.html"
    stock_list_url = "https://www.banban.cn/gupiao/list_sz.html"
    # stock_info_url = "http://quote.eastmoney.com/sh"  # sh = Shanghai, sz = Shenzhen
    stock_info_url = "http://quote.eastmoney.com/sz"
    output_file = "D://pythontest/files/gupiao/gupiao20191212.txt"
    # slist = ['002656','002702','000001','000002']
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
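The single-line progress display above relies on the '\r' carriage return; a minimal standalone sketch of just that technique:

import time

for i in range(1, 101):
    # '\r' rewinds to the start of the line and end='' suppresses the newline,
    # so each print overwrites the previous percentage
    print('\rprogress: {:.2f}%'.format(float(i)), end='')
    time.sleep(0.02)
print()  # move to a fresh line when done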

Example: scraping multiple fields and writing them into MySQL:

import requests
from bs4 import BeautifulSoup
import traceback
import re
from 基础用法.toMysql import ToMySql

def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code  # to auto-detect the page encoding instead, use: r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            # \d{6} matches the 6-digit stock code embedded in the link.
            # Note: re.findall returns a list such as ['600000'], so take [0];
            # otherwise we would append the whole list instead of the string.
            lst.append(re.findall(r"\d{6}", href)[0])
        except (KeyError, IndexError):
            continue

def getStockInfo(lst, stockURL, fpath):
    count = 0  # progress counter
    for stock in lst:
        url = stockURL + stock + ".html"  # build the per-stock URL
        html = getHTMLText(url)  # fetch the stock page
        print('==================')
        print(url)
        try:
            if html == "":  # skip empty pages
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')  # parse the page
            text = soup.text
            # regex: extract the stock name
            stockName = re.search(r'[\w\u4e00-\u9fcc]+', text).group(0)
            # regex: extract the 6-digit stock code
            stockNumber = re.search(r'[0-9]\d{5}', text).group(0)
            # field regexes; [::] accepts both half- and full-width colons
            jk = re.compile(r"今开[::][0-9]*\.[0-9]*")     # open
            zs = re.compile(r"昨收[::][0-9]*\.[0-9]*")     # previous close
            zg = re.compile(r"最高[::][0-9]*\.[0-9]*")     # high
            zd = re.compile(r"最低[::][0-9]*\.[0-9]*")     # low
            hsl = re.compile(r"换手率[::][0-9]*\.[0-9]*")  # turnover rate
            syl = re.compile(r"市盈率[::][0-9]*\.[0-9]*")  # P/E ratio

            jkV = jk.findall(text)
            zsV = zs.findall(text)
            zgV = zg.findall(text)
            zdV = zd.findall(text)
            hslV = hsl.findall(text)
            sylV = syl.findall(text)

            hslV = str(hslV[0]) + "%"  # e.g. '换手率:0.53' -> '换手率:0.53%'
            print(jkV[0])
            print(zsV[0])
            print(zgV[0])
            print(zdV[0])
            print(str(hslV))
            print(sylV[0])

            # split "label:value" on either colon type, consistent with the regexes above
            jkNum = re.split(r"[::]", jkV[0])[1]
            zsNum = re.split(r"[::]", zsV[0])[1]
            zgNum = re.split(r"[::]", zgV[0])[1]
            zdNum = re.split(r"[::]", zdV[0])[1]
            hslNum = re.split(r"[::]", hslV)[1]
            sylNum = re.split(r"[::]", sylV[0])[1]

            # write the record into MySQL with a parameterized INSERT
            sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
            data = (stockName, stockNumber, jkNum, zsNum, zgNum, zdNum, hslNum, sylNum)  # numeric types would also be accepted
            result = ToMySql.writeDb(sql, data)

            # also keep the record as a dict in the output file
            infoDict.update({'名称': stockName})    # name
            infoDict.update({'代码': stockNumber})  # code
            infoDict.update({'今开': jkNum})        # open
            infoDict.update({'昨收': zsNum})        # previous close
            infoDict.update({'最高': zgNum})        # high
            infoDict.update({'最低': zdNum})        # low
            infoDict.update({'换手率': hslNum})     # turnover rate
            infoDict.update({'市盈率': sylNum})     # P/E ratio

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                # '\r' returns the cursor to the start of the line so each print
                # overwrites the last one: a one-line dynamic progress display
                print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
                # optional throttle between requests:
                # time.sleep(10)
        except Exception:
            count = count + 1
            print('\rprogress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue



if __name__ == '__main__':
    # stock_list_url = "https://www.banban.cn/gupiao/list_sh.html"
    stock_list_url = "https://www.banban.cn/gupiao/list_sz.html"
    # stock_info_url = "http://quote.eastmoney.com/sh"  # sh = Shanghai, sz = Shenzhen
    stock_info_url = "http://quote.cfi.cn/quote_"
    output_file = "D://pythontest/files/gupiao/我的股票信息.csv"  # note: lines are repr(dict), not real CSV
    # slist = ['002656','002702','000001','000002']
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
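To see how the field regexes and the colon split behave, here is a small standalone check; the sample string is made up for illustration and is not real page content:

import re

sample = "平安银行 000001 今开:10.50 昨收:10.30 最高:10.80 最低:10.10"
jk = re.compile(r"今开[::][0-9]*\.[0-9]*")
matches = jk.findall(sample)             # -> ['今开:10.50']
# split on either colon type and keep only the numeric part
print(re.split(r"[::]", matches[0])[1])  # -> '10.50'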

The MySQL helper module (toMysql; the scraper above calls it through a ToMySql wrapper that is not shown here):

import pymysql
import logging

db_name = 'python'
db_user = 'root'
db_pass = 'root'
db_ip = '127.0.0.1'
db_port = 3306


# write data into the database
def writeDb(sql, db_data=()):
    """
    Open a MySQL connection and execute a single write operation.
    """
    try:
        conn = pymysql.connect(db=db_name, user=db_user, passwd=db_pass,
                               host=db_ip, port=int(db_port), charset="utf8")
        cursor = conn.cursor()
    except Exception as e:
        print(e)
        logging.error('database connection failed: %s' % e)
        return False

    try:
        cursor.execute(sql, db_data)
        conn.commit()
    except Exception as e:
        conn.rollback()
        logging.error('database write failed: %s' % e)
        return False
    finally:
        cursor.close()
        conn.close()
    return True


# example: a generic insert
# sql = """ INSERT INTO user(email,last_name) VALUES(%s,%s) """
# data = ("[email protected]", "男")
# result = writeDb(sql, data)


# guard the self-test so it does not fire every time this module is imported
if __name__ == '__main__':
    sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
    data = ("1", "2", "3", "4", "5", "6", "7", "8")  # numeric types would also be accepted
    result = writeDb(sql, data)

The stock_infos table:

CREATE TABLE stock_infos
(
    id INT NOT NULL AUTO_INCREMENT COMMENT 'primary key',
    stock_name VARCHAR(30),
    stock_code VARCHAR(30),
    jk VARCHAR(10) COMMENT 'open (今开)',
    zs VARCHAR(10) COMMENT 'previous close (昨收)',
    zg VARCHAR(10) COMMENT 'high (最高)',
    zd VARCHAR(10) COMMENT 'low (最低)',
    hsl VARCHAR(10) COMMENT 'turnover rate (换手率)',
    syl VARCHAR(10) COMMENT 'P/E ratio (市盈率)',
    PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
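With the table populated, the visualization promised in the title can be layered on top. A minimal sketch using pandas and matplotlib (my own tool choice; the original article does not include its plotting code):

import pymysql
import pandas as pd
import matplotlib.pyplot as plt

# read the scraped rows back out of MySQL
conn = pymysql.connect(db='python', user='root', passwd='root',
                       host='127.0.0.1', port=3306, charset='utf8')
cursor = conn.cursor()
cursor.execute("SELECT stock_name, jk, zs FROM stock_infos")
rows = cursor.fetchall()
cursor.close()
conn.close()

df = pd.DataFrame(rows, columns=['stock_name', 'jk', 'zs'])
# the columns were stored as VARCHAR, so cast them to float before plotting
df[['jk', 'zs']] = df[['jk', 'zs']].astype(float)

# bar chart: open vs. previous close for the first 20 stocks
df.head(20).plot(x='stock_name', y=['jk', 'zs'], kind='bar')
plt.legend(['open (jk)', 'previous close (zs)'])
plt.tight_layout()
plt.show()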


The percentage progress bar produced while scraping:

[Screenshot: the one-line percentage progress display]
