文本数据保存

文本数据保存

  • 工具
  • 目的
  • 代码
  • 运行结果

工具

pycharm

目的

'''
网址:https://ljgk.envsc.cn/
需求:获取每家公司的地址(address)、公司名称(ps_name)和创建时间(create_time),并将数据分别保存为 JSON 文件和 CSV 表格
'''

代码

# 动态数据,刷新网页后,有一个包含所有数据的XHR
import requests
import json
import csv

class Spider:
    """Fetch a record listing over HTTP and export selected fields to JSON and CSV.

    Typical use is ``Spider(url, headers, file_name, text_type=2).run()``,
    which requests ``url``, keeps the ``FIELDNAMES`` keys of every returned
    record and writes ``<file_name>.json`` and ``<file_name>.csv``.
    """

    # Keys copied from every record; this is also the CSV column order.
    FIELDNAMES = ('ps_name', 'address', 'create_time')

    def __init__(self, url, headers, file_name, data=None, json=None, text_type=0, timeout=30):
        """Store the request settings and start with an empty record list.

        Args:
            url: Address to request.
            headers: HTTP headers sent with the request.
            file_name: Output file name without extension.
            data: Optional request body, passed to ``requests.get`` as ``data``.
            json: Optional JSON body, passed to ``requests.get`` as ``json``.
            text_type: Shape of the value returned by ``get_data``:
                0 = ``response.text``, 1 = ``response.content``,
                2 = ``response.json()``.
            timeout: Seconds to wait for the server before giving up.
        """
        self.list = []  # records accumulated by parse_data()
        self.url = url
        self.headers = headers
        self.data = data
        self.json = json
        self.text_type = text_type
        self.file_name = file_name
        self.timeout = timeout

    def get_data(self):
        """Request ``self.url`` and return the body in the form chosen by ``text_type``.

        Raises:
            ValueError: If ``text_type`` is not 0, 1 or 2.
            requests.HTTPError: If the server answers with an error status.
        """
        # Reject an unknown mode up front instead of silently returning None
        # after the request has already been made.
        if self.text_type not in (0, 1, 2):
            raise ValueError(
                'text_type must be 0 (text), 1 (content) or 2 (json), got {!r}'.format(self.text_type)
            )

        response = requests.get(
            self.url,
            headers=self.headers,
            data=self.data,
            json=self.json,
            timeout=self.timeout,  # without a timeout a stalled server blocks forever
        )
        # Fail loudly on 4xx/5xx rather than parsing or saving an error page.
        response.raise_for_status()

        if self.text_type == 0:
            return response.text
        if self.text_type == 1:
            return response.content
        return response.json()

    def parse_data(self, text):
        """Append the ``FIELDNAMES`` entries of every record in ``text`` to ``self.list``.

        Args:
            text: An iterable of mapping records, or a ``str``/``bytes`` JSON
                document that decodes to one (the forms returned by
                ``get_data`` for ``text_type`` 0 and 1).

        Raises:
            KeyError: If a record lacks one of the ``FIELDNAMES`` keys.
        """
        # Raw bodies are decoded first so every text_type mode can be parsed.
        if isinstance(text, (str, bytes, bytearray)):
            text = json.loads(text)

        self.list.extend(
            {field: record[field] for field in self.FIELDNAMES}
            for record in text
        )

    def save_data(self):
        """Write ``self.list`` to ``<file_name>.json`` and ``<file_name>.csv``.

        Both files are overwritten, so they always hold the same snapshot and
        the CSV contains exactly one header row.
        """
        with open('{}.json'.format(self.file_name), 'w', encoding='utf8') as f:
            json.dump(self.list, f, ensure_ascii=False, indent=2)

        # 'w' rather than 'a': appending while always writing the header
        # duplicated the header and the rows on every run.
        with open('{}.csv'.format(self.file_name), 'w', encoding='utf8', newline='') as f:
            csv_writer = csv.DictWriter(f, fieldnames=list(self.FIELDNAMES))
            csv_writer.writeheader()
            csv_writer.writerows(self.list)

    def run(self):
        """Fetch, parse and save in one call."""
        text = self.get_data()
        self.parse_data(text)
        self.save_data()


if __name__ == '__main__':
    # Browser-like User-Agent sent with the request.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Listing endpoint with its query string hard-coded.
    # NOTE(review): sgn/ts/tc look like a signature and timestamp that may
    # expire; confirm they are still accepted before relying on this URL.
    endpoint = (
        'https://ljgk.envsc.cn/OutInterface/GetPSList.ashx?regionCode=0&psname='
        '&SystemType=C16A882D480E678F&sgn=2c887fad3076724ffd70d22320308a5d7b501610'
        '&ts=1691844481490&tc=11515962'
    )
    # text_type=2 asks the spider for the decoded JSON body; output goes to
    # companies.json and companies.csv.
    Spider(endpoint, request_headers, 'companies', text_type=2).run()

运行结果

见资源

你可能感兴趣的:(爬虫,python,json,csv)