爬取股票数据

import requests
from bs4 import BeautifulSoup
import pymongo
# connect=False lets the client be created without connecting immediately;
# the first real operation below (list_collection_names) opens the connection.
client = pymongo.MongoClient(host='localhost', port=27017, connect=False)
stocks = client['stock']
stock_data = stocks['stock_data']

# Start each run with an empty collection: if an earlier run left a
# "stock_data" collection behind, drop it so records are not duplicated.
if "stock_data" in stocks.list_collection_names():
    stock_data.drop()

def getcodeUrl(url, timeout=30):
    """Fetch a stock-list page and collect the link text and URL of its anchors.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A tuple ``(nameLi, hrefLi)`` of two equally long lists: the text of
        each anchor and its ``href`` prefixed with ``http://quote.cfi.cn/``.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including when the timeout expires).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    server = 'http://quote.cfi.cn/'

    nameLi = []
    hrefLi = []
    # The last 8 anchors on the page are ignored.
    # NOTE(review): presumably these are footer/navigation links - confirm.
    for item in soup.select('a')[:-8]:
        href = item.get('href')
        if href is None:
            # An anchor without an href cannot yield a quote URL; skip it
            # instead of raising KeyError and aborting the whole crawl.
            continue
        nameLi.append(item.text)
        hrefLi.append(server + href)
    return nameLi, hrefLi


# Texts the quote page shows in place of a number for the two P/E cells.
_PLACEHOLDER_VALUES = ("--", "正无穷大")


def _after_colon(text):
    """Return the second ':'-separated field of *text* (IndexError if none)."""
    return text.split(":")[1]


def _ratio_or_zero(text):
    """Return the field after the colon, or "0" when it is a placeholder."""
    value = _after_colon(text)
    return "0" if value in _PLACEHOLDER_VALUES else value


def getcodeInfo(name, code_url, timeout=30):
    """Download one stock's quote page, parse it and insert it into MongoDB.

    The function is best-effort: a download, parsing, conversion or insert
    failure is printed and the stock is skipped, so one bad page does not
    stop the crawl.

    Args:
        name: Stock name, stored under the '股票名称' key.
        code_url: URL of the stock's quote page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if a document was inserted into ``stock_data``, False if the
        stock was skipped because of an error.
    """
    try:
        response = requests.get(code_url, timeout=timeout)
    except requests.RequestException as e:
        # A network failure for one stock should not abort the whole run.
        print(name, code_url, repr(e))
        return False

    soup = BeautifulSoup(response.text, 'lxml')
    # Start parsing the data.
    # NOTE(review): `id_` is not part of the CSS selector, so this appears to
    # return every <td> on the page; the positional slice below relies on
    # that - confirm before changing either line.
    datas = soup.select('td', id_="act_quote")
    infos = datas[29:42]

    try:
        texts = [cell.text for cell in infos]

        c0 = _after_colon(texts[0])
        c1 = _after_colon(texts[1])
        c2 = _after_colon(texts[2])
        c3 = _after_colon(texts[3])
        c4 = _after_colon(texts[4])
        c5 = _after_colon(texts[5])
        # Strip the unit suffixes so the values convert with float().
        c6 = _after_colon(texts[6].replace("手", ""))
        c7 = _after_colon(texts[7].replace("万元", ""))
        c8 = _ratio_or_zero(texts[8])
        # This cell also carries a "计算公式说明" label that must be removed.
        c9 = _ratio_or_zero(texts[9].replace("计算公式说明", ""))
        c10 = _after_colon(texts[10])

        # Earnings per share: taken from the second space-separated token of
        # the cell; defaults to "0" when the cell contains no colon at all.
        if len(texts[11].split(":")) >= 2:
            c11 = texts[11].split(" ")[1].split(":")[1].replace("元", "")
        else:
            c11 = "0"

        # Industry information: insert ':' separators in front of the two
        # average-P/E labels so the cell splits into alternating label/value
        # fields; fields 1, 3 and 5 are the values stored below.
        h_info = texts[12].replace("平均市盈率",":平均市盈率").replace("扣除后平均市盈率",":扣除后平均市盈率").replace("扣除后:",":扣除后").split(":")
        c12 = h_info[1]
        c13 = h_info[3]
        c14 = h_info[5]

        # Industry link, rewritten to an absolute URL sorted by stock code,
        # e.g. http://quote.cfi.cn/quotelist.aspx?sortcol=stockcode&sortway=asc&bklb=hy&bkid=17400066
        c15 = infos[12].select('a')[0]['href'].replace("quotelist.aspx?bklb=hy&bkid=","http://quote.cfi.cn/quotelist.aspx?sortcol=stockcode&sortway=asc&bklb=hy&bkid=")

        # NOTE(review): '还手率' looks like a typo for '换手率'; it is kept
        # unchanged because it is the field name stored in the database.
        data = {
            '股票名称': name,
            '今开': float(c0),
            '最高': float(c1),
            '增幅': float(c2.strip("%")),
            '还手率': float(c3.strip("%")),
            '昨收': float(c4),
            '最低': float(c5),
            '成交量': float(c6),
            '成交金额': float(c7),
            '市盈率': float(c8),
            '扣除后市盈率': float(c9),
            '市净率': float(c10),
            '每股收益': float(c11),
            '所属行业': c12,
            "所属产业链接": c15,
            '平均市盈率': float(c13),
            '扣除后平均市盈率': float(c14),
        }
        stock_data.insert_one(data)
    except Exception as e:
        # Deliberately broad: any parse, conversion or insert failure skips
        # this stock. Print which stock failed, not just the bare error.
        print(name, code_url, repr(e))
        return False
    return True


# Listing pages to crawl, one per stock category (suffix 2..17).
# NOTE(review): the query string is "?=t<n>"; it may have been intended as
# "?t=<n>" - confirm against the site before changing it.
urlList = ['http://quote.cfi.cn/stockList.aspx?=t' + str(n) for n in range(2,18)]


# Running count of processed stocks, used only in the progress message.
n = 1
for url in urlList:
    # Fetch each listing page once and unpack both lists; calling
    # getcodeUrl() separately for names and hrefs downloads the page twice.
    names, hrefs = getcodeUrl(url)
    for name, href in zip(names, hrefs):
        getcodeInfo(name, href)
        # NOTE(review): getcodeInfo catches and prints its own errors, so
        # this message is printed even when nothing was inserted.
        print("插入成功第%d条数据"%n,name)
        n += 1


你可能感兴趣的:(爬虫,人工智能,爬取新闻)