Scraping table data from a web page and saving it to a CSV file

import pandas as pd
import requests
from fake_useragent import UserAgent
from io import StringIO
from lxml import etree

'''Parse the web page and extract the table HTML'''

def parse_html(url):
    ua = UserAgent()
    print(ua.random)  # print a randomly chosen browser User-Agent
    headers = {
        'User-Agent': ua.random,
        'Referer': 'https://www.dxsbb.com'
    }

    try:
        resp = requests.get(url, headers=headers, timeout=10)
        # Use the encoding detected from the response body
        resp.encoding = resp.apparent_encoding
        if resp.status_code == 200:
            tree = etree.HTML(resp.text)
            # Locate the table element on the page
            tb = tree.xpath('//*[@id="content"]/table')
            print("table elements:", tb)
            # Serialize the element back to an HTML string (bytes -> str)
            tb = etree.tostring(tb[0], encoding='utf8').decode()
            return tb
        else:
            print("Request failed with status code:", resp.status_code)
    except Exception as e:
        print("Failed to fetch or parse the page:", e)
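If the page layout cooperates, the lxml/XPath step can be skipped entirely: requests fetches the HTML and pd.read_html extracts every table it finds. The following is only a minimal sketch of that alternative, not part of the original script; fetch_tables is a hypothetical helper name, and the URL and Referer are the same ones used above.

# Alternative sketch: let pandas parse the fetched HTML directly, no XPath needed.
import requests
import pandas as pd
from io import StringIO
from fake_useragent import UserAgent

def fetch_tables(url):
    headers = {'User-Agent': UserAgent().random, 'Referer': 'https://www.dxsbb.com'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    # read_html returns every <table> on the page as a DataFrame
    return pd.read_html(StringIO(resp.text), header=0)

# tables = fetch_tables('https://www.dxsbb.com/news/50354.html')
# tables[0].to_csv('211_university.csv', index=False)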

def main():
    url = 'https://www.dxsbb.com/news/50354.html'
    tb = parse_html(url)
    if tb is None:
        print("No table HTML was returned; aborting.")
        return
    # Parse the table HTML into a DataFrame
    # (newer pandas versions expect a file-like object, hence the StringIO wrapper)
    df = pd.read_html(StringIO(tb), header=0)[0]
    # Convert the DataFrame into a list of dicts, one dict per row
    # (equivalently: df.to_dict(orient='records'))
    result = list(df.T.to_dict().values())
    print("result:", result)
    # Save as CSV
    df.to_csv('211_university.csv', index=False)

if __name__ == '__main__':
    main()
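After the script runs, a quick way to confirm the CSV was written correctly is to read it back with pandas. The file name 211_university.csv comes from the script above; the check itself is just an illustration.

# Sanity check (illustrative): load the saved CSV and inspect it
import pandas as pd

check = pd.read_csv('211_university.csv')
print(check.shape)   # (number of rows, number of columns)
print(check.head())  # first five rows of the scraped table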
