【爬虫入门】股票数据爬取

  • 运行前需将 main() 里的 output_file 修改为本机可写的输出文件路径
  • 东方财富网 + 腾讯证券
import re
import requests
import traceback
from bs4 import BeautifulSoup

def getHtmlText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or "" when the request fails (connection
        error, timeout after 30 seconds, or an HTTP 4xx/5xx status). A
        failure is also reported on stdout.
    """
    try:
        r = requests.get(url, timeout=30)
        # Must be *called*: a bare attribute reference does nothing, which
        # previously let HTTP error pages through as if they were content.
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here, so Ctrl-C and
        # programming errors are no longer silently swallowed.
        print("访问失败")
        return ""
    
def getStockList(ls, stockurl):
    """Collect stock codes linked from the page at *stockurl* into *ls*.

    Every ``<a>`` element on the page is inspected; when its ``href``
    contains ``sh`` or ``sz`` followed by six digits, the first such code
    is appended to *ls*. Duplicates are kept, in page order.

    Args:
        ls: List that receives the codes (mutated in place).
        stockurl: Address of the page listing the stocks.

    Returns:
        None. If the page cannot be fetched, *ls* is left unchanged.
    """
    html = getHtmlText(stockurl)
    soup = BeautifulSoup(html, "html.parser")
    codePattern = re.compile(r'[s][hz]\d{6}')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Anchors without a usable href are skipped explicitly instead of
        # relying on a catch-all except to hide the resulting error.
        if not isinstance(href, str):
            continue
        match = codePattern.search(href)
        if match:
            ls.append(match.group(0))
    
def _parseStockInfo(html):
    """Extract one stock's name and listed fields from a quote page.

    Args:
        html: Markup of a single stock page.

    Returns:
        A dict whose first entry is '股票名称' followed by one entry per
        ``<li>`` found in the info panel, or None when the page does not
        have the expected layout or the stock name is '--'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    stockName = soup.find('div', attrs={'class': 'title_bg'})
    stockInfo = soup.find('div', attrs={'class': 'col-2 fr'})
    if stockName is None or stockInfo is None:
        return None

    names = stockName.find_all(attrs={'class': 'col-1-1'})
    if not names:
        return None
    nameParts = names[0].text.split()
    # NOTE(review): '--' is presumably the site's placeholder for a stock
    # without data -- confirm against the live page.
    if not nameParts or nameParts[0] == '--':
        return None

    infoDict = {'股票名称': nameParts[0]}
    for item in stockInfo.find_all('li'):
        # Each fragment is the text between a '>' and the next '<'; the
        # second fragment is used as the label and the fourth as the value.
        pieces = re.findall('>.*?<', str(item))
        if len(pieces) < 2:
            # An item without a label means the layout is not the expected
            # one; the whole stock is skipped.
            return None
        key = pieces[1][1:-1].replace('\u2003', '').replace('\xa0', '')
        val = pieces[3][1:-1] if len(pieces) > 3 else '--'
        infoDict[key] = val
    return infoDict


def getStockInfo(ls, stockurl, fpath):
    """Fetch each stock's page and append its parsed fields to *fpath*.

    For every code in *ls* the page ``stockurl + code + "/gp"`` is
    downloaded and parsed, and the resulting dict is appended to *fpath*
    as one ``str(dict)`` line (UTF-8). Stocks whose page cannot be fetched
    or does not have the expected layout are skipped.

    Args:
        ls: Iterable of stock codes.
        stockurl: Base address of the per-stock pages.
        fpath: Path of the output file, opened in append mode.

    Returns:
        None.
    """
    for stock in ls:
        html = getHtmlText(stockurl + stock + "/gp")
        if html == "":
            continue
        try:
            infoDict = _parseStockInfo(html)
            if infoDict is None:
                continue
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # One bad page or a failed write must not abort the crawl, but
            # it is reported rather than silently dropped. Ctrl-C is no
            # longer caught here, so the crawl can be interrupted.
            traceback.print_exc()
    
def main(output_file='/home/lwy/Spiders/stockinfo.txt'):
    """Crawl the stock list, then write each stock's details to a file.

    Args:
        output_file: Path of the file the results are appended to. The
            default is the original hard-coded location; pass another
            path instead of editing the source.

    Returns:
        None.
    """
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'
    stock_info_url = 'http://gu.qq.com/'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
    
# Run the crawl only when executed as a script, so importing this module
# does not start network requests and file writes.
if __name__ == "__main__":
    main()

你可能感兴趣的:(Spiders)