"""Scrape financial statements of listed companies (爬取上市公司财务报表) from 163.com."""

from urllib import request
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd


def Data_collector(url):
    """Scrape one financial-statement table from a 163.com stock quote page.

    Parameters
    ----------
    url : str
        Report page URL, e.g.
        "http://quotes.money.163.com/f10/zycwzb_600519.html#01c02".

    Returns
    -------
    pandas.DataFrame
        Table whose index is the row labels (first, fixed column of the page)
        and whose columns are the report dates from the header row.

    Raises
    ------
    urllib.error.URLError
        If the HTTP request fails.
    AttributeError
        If the expected tables are not found in the page (``find`` returns None).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url=url, headers=headers)
    # Context manager closes the HTTP response deterministically
    # (the original leaked the connection).
    with request.urlopen(req) as res:
        print(res.status)
        html_data = res.read().decode("utf-8")
    soup = bs(html_data, "html.parser")

    # Row labels: the fixed left-hand column of the report.
    label_table = soup.find("table", class_="table_bg001 border_box limit_sale")
    row_labels = [td.text for td in label_table.find_all("td", class_="td_1")]

    # The scrollable table holds both the date header row and all data cells;
    # find it once (the original searched for the same table twice).
    scr_table = soup.find("table", class_="table_bg001 border_box limit_sale scr_table")

    # Report dates from the header row.
    date_row = scr_table.find("tr", class_="dbrow")
    dates = [th.text for th in date_row.find_all("th")]

    # Data cells, one list per table row; rows without <td> (header rows)
    # are skipped.
    content = []
    for tr in scr_table.find_all("tr"):
        td_data = [td.text for td in tr.find_all("td")]
        if td_data:
            content.append(td_data)

    frame = pd.DataFrame(content)
    frame.index = pd.Series(row_labels)
    frame.columns = dates

    print("done")
    return frame


if __name__ == "__main__":
    # Balance sheet, income statement and cash-flow statement for stock 600519.
    url_list = ['http://quotes.money.163.com/f10/zcfzb_600519.html#01c05',
                'http://quotes.money.163.com/f10/lrb_600519.html#01c06',
                'http://quotes.money.163.com/f10/xjllb_600519.html#01c07']
    # DataFrame.append was removed in pandas 2.0; collect all frames first
    # and concatenate once (also avoids quadratic repeated copying).
    frames = [Data_collector(u) for u in url_list]
    Data_all = pd.concat(frames) if frames else pd.DataFrame()
    # Keep the final status message under the guard so importing this module
    # has no side effects.
    print("done")

 

你可能感兴趣的:(爬取上市公司财务报表)