python操作excel文件,简单数据整合

小妹丢来3张表,要求按格式整理成一张新excel表;3张表数据杂乱不是重点,重点是工期只有半天,说多了都是泪……

数据杂乱的话,整理好数据结构,每张表去遍历一遍,然后存入自建的数据库,再从数据库里导出需要的表也是可以实现的;既然工期紧张,这些就没时间去深入研究了,直接把原来3张表人工处理一下,确保相同的行与列,直接用行列定位的方式做简单的数据整合吧。

python操作excel文件,简单数据整合_第1张图片python操作excel文件,简单数据整合_第2张图片

每张表前三行都是表头,前三列都是相同格式;每个企业有4行数据,经过手工处理每张表企业根据代码排序且相同,数据总量288条,最终需要整合成下面格式:

python操作excel文件,简单数据整合_第3张图片

从3个表里取出相应的数据,每个企业分类并按照时间顺序排序,操作excel前记得先装好相应的库:

import xlrd
import xlwt

根据需求,要整合的数据创建一个类:

class company(object):
    """One output row: the merged indicators of a company for a single year.

    Every field starts as an empty string and is filled in later by the code
    that builds the record. The single-letter attribute names correspond to
    the spreadsheet column letters of the output sheet (D = 4th column, ...).
    """

    def __init__(self):
        self.time = ''  # year
        self.code = ''  # stock code
        self.name = ''  # company name
        self.D = ''  # general manager type
        self.E = ''  # management type
        self.F = ''  # management ratio
        self.G = ''  # chairman type
        self.H = ''  # board type
        self.I = ''  # board ratio
        self.J = ''  # CEO-and-chairman type
        self.K = ''  # company size
        self.L = ''  # financial leverage
        self.M = ''  # return on assets
        self.N = ''  # capital intensity
        self.O = ''  # Tobin's Q
        self.P = ''  # dividend payout ratio
        self.Q = ''  # industry

    def __repr__(self):
        # Show every field so that print(record) is useful when debugging,
        # instead of the default "<company object at 0x...>".
        fields = ', '.join(
            '{}={!r}'.format(key, value) for key, value in vars(self).items()
        )
        return '{}({})'.format(type(self).__name__, fields)

读取3张excel:

# Open the three source workbooks (hard-coded paths on the author's desktop).
# NOTE(review): 2.xlsx is opened with xlrd; xlrd 2.0+ reads only .xls files,
# so confirm the installed xlrd version still accepts it.
f_1, f_2, f_3 = [
    xlrd.open_workbook(path)
    for path in (
        r'C:\Users\ysc\Desktop\1.xls',
        r'C:\Users\ysc\Desktop\2.xlsx',
        r'C:\Users\ysc\Desktop\3.xls',
    )
]

# Only the first sheet of each workbook is used.
f_1Sheet1, f_2Sheet1, f_3Sheet1 = [
    book.sheet_by_index(0) for book in (f_1, f_2, f_3)
]
# Handy while exploring a workbook: f_1.sheet_names() lists all sheets,
# f_1Sheet1.row_values(0) / f_1Sheet1.col_values(0) return the first row / column.

由于每个企业有4行数据,且最终要按照时间排序,写个分段读取函数:

# Read one company's segment from the first sheet
def read_excel(start, end):
    """Collect the years of one company from sheet 1 and merge that company.

    Rows ``start`` .. ``end - 1`` of ``f_1Sheet1`` are scanned. The value in
    column 0 of the first row is taken as the company code; column 2 of every
    row carrying that code is collected. As soon as a row with a different
    code is met, the collected values are sorted and handed to ``find_data``
    together with the row range ``[start, that row)``.

    Row index 291 is treated as the end of the data: the current company is
    merged, every record in ``data`` is printed and ``write_excel(data)`` is
    called. If the scan ends without hitting either condition, nothing is
    merged.
    """
    code = ''
    time = []
    for row_idx in range(start, end):
        if row_idx == 291:
            # End of data: flush the last company and write the output file.
            time = selectionSort(time)
            find_data(code, time, start, row_idx)
            for record in data:
                print(record)
            write_excel(data)
            return

        rows = f_1Sheet1.row_values(row_idx)  # contents of the current row
        row_code = rows[0]
        if code == '':
            # First row of the segment defines the company being read.
            code = row_code
            print(time)
            print(code)

        if code != row_code:
            # Reached the next company: merge what has been collected so far.
            time = selectionSort(time)
            find_data(code, time, start, row_idx)
            return

        time.append(rows[2])
        print(time)
        print(code)

比如说先读取第一个企业的数据:read_excel(3, 8) 会依次读取表格第4行到第8行,其中第4~7行是该企业的4条数据,第8行属于下一个企业,只用来判断分段结束;然后取出时间推入数组排序,传递给find_data函数对3张表进行查找:

def find_data(code, time, start_i, end_i):
    """Build one ``company`` record per year and append it to ``data``.

    For every value in ``time`` the rows ``start_i`` .. ``end_i - 1`` of the
    three sheets are searched for a row whose column 2 equals that value, and
    the matching row's cells are copied into the record. If several rows
    match, the last one wins. The three sheets are indexed with the same row
    numbers, so they must be aligned row by row.

    :param code: company code stored on every record.
    :param time: iterable of year values, processed in the given order.
    :param start_i: first row index of the company's segment (inclusive).
    :param end_i: end row index of the segment (exclusive).
    """
    if not time:
        return

    # The row contents do not depend on the year being matched, so each row
    # is read once up front instead of once per year.
    row_triples = [
        (
            f_1Sheet1.row_values(i),
            f_2Sheet1.row_values(i),
            f_3Sheet1.row_values(i),
        )
        for i in range(start_i, end_i)
    ]

    for t in time:
        new_data = company()  # already a fresh instance; no deepcopy needed
        new_data.time = t
        new_data.code = code
        for f_1rows, f_2rows, f_3rows in row_triples:
            if t == f_1rows[2]:
                new_data.D = f_1rows[7]
                new_data.name = f_1rows[1]
                new_data.G = f_1rows[6]
                new_data.J = f_1rows[8]
                new_data.Q = f_1rows[5]
            if t == f_2rows[2]:
                new_data.F = f_2rows[4]
                new_data.I = f_2rows[3]
                # The "type" flag is 1 when the corresponding ratio is not 0.
                # NOTE(review): any value other than numeric 0, including an
                # empty-string cell, yields 1 -- confirm that is intended.
                new_data.E = 1 if new_data.F != 0 else 0
                new_data.H = 1 if new_data.I != 0 else 0
            if t == f_3rows[2]:
                new_data.K = f_3rows[4]
                new_data.L = f_3rows[6]
                new_data.M = f_3rows[7]
                new_data.N = f_3rows[10]
                new_data.O = f_3rows[11]
                new_data.P = f_3rows[12]

        data.append(new_data)

由于3张表企业结构统一,所以不需要逐一遍历比对,直接根据行列就能查找数据;先将数据对象推入数组,最后循环写入新文件即可。

def write_excel(data_list):
    """Write the merged records to ``demo333.xls`` in the working directory.

    Row 0 of the single sheet ``sheet1`` holds the styled header; each record
    of ``data_list`` then fills one row, in order, with its attributes
    ``time, code, name, D .. Q`` in columns 0 .. 16. Each record is also
    printed as it is written.

    :param data_list: iterable of record objects exposing those attributes.
    """
    f = xlwt.Workbook()  # create the workbook
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create the sheet
    row0 = [u'年份 ', u'股票代码', u'公司名称', u'总经理类型', u'管理层类型', u'管理层比例', u'董事长类型', u'董事会类型',
            u'董事会比例', u'CEO及董事长类型', u'公司规模', u'财务杠杆', u'资产净利润率', u'资本密集度', u'托宾Q', u'股利支付率', u'行业']

    # Header row: the style is the same for every cell, so build it once.
    header_style = set_style('Times New Roman', 220, True)
    for col, title in enumerate(row0):
        sheet1.write(0, col, title, header_style)

    # Attribute written to each output column, in column order.
    columns = ('time', 'code', 'name', 'D', 'E', 'F', 'G', 'H', 'I',
               'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q')
    for row, record in enumerate(data_list, start=1):
        print(record)
        for col, attr in enumerate(columns):
            sheet1.write(row, col, getattr(record, attr))

    # The file is saved in the Excel 2003 format, so the extension has to be
    # .xls rather than .xlsx.
    f.save('demo333.xls')

写一个等差数列,循环执行read_excel函数:

# Arithmetic sequence with a common difference of 4 (each company has 4 rows)
    k = 4
    # Start row index of every company segment: 3, 7, 11, ... (below 291)
    l = [i for i in range(3, 291, k)]
    print('数据分段')
    print(l)
    for i in l:
        # Pass k + 1 rows so the scan can also see the row after the segment
        read_excel(i, i + k + 1)

部分调试数据:

python操作excel文件,简单数据整合_第4张图片

 

最终效果:

 

python操作excel文件,简单数据整合_第5张图片

源码放上来,做个备份:

import xlrd
import xlwt
import copy


# Record object holding the formatted (merged) data
class company(object):
    """One output row: the merged indicators of a company for a single year.

    Every field starts as an empty string and is filled in later by the code
    that builds the record. The single-letter attribute names correspond to
    the spreadsheet column letters of the output sheet (D = 4th column, ...).
    """

    def __init__(self):
        self.time = ''  # year
        self.code = ''  # stock code
        self.name = ''  # company name
        self.D = ''  # general manager type
        self.E = ''  # management type
        self.F = ''  # management ratio
        self.G = ''  # chairman type
        self.H = ''  # board type
        self.I = ''  # board ratio
        self.J = ''  # CEO-and-chairman type
        self.K = ''  # company size
        self.L = ''  # financial leverage
        self.M = ''  # return on assets
        self.N = ''  # capital intensity
        self.O = ''  # Tobin's Q
        self.P = ''  # dividend payout ratio
        self.Q = ''  # industry

    def __repr__(self):
        # Show every field so that print(record) is useful when debugging,
        # instead of the default "<company object at 0x...>".
        fields = ', '.join(
            '{}={!r}'.format(key, value) for key, value in vars(self).items()
        )
        return '{}({})'.format(type(self).__name__, fields)

# Module-level accumulator of merged records: find_data() appends one record
# per company/year and read_excel() finally passes the list to write_excel().
data = []


def findSmallest(arr):
    """Return the index of the smallest element of ``arr``.

    When the minimum occurs more than once, the index of its first occurrence
    is returned. An empty ``arr`` raises ``IndexError``.
    """
    best_index = 0
    best_value = arr[0]  # start with the first element as the candidate
    for index, value in enumerate(arr[1:], start=1):
        if value < best_value:  # strictly smaller, so earlier ties are kept
            best_index, best_value = index, value
    return best_index

# Sorting function
def selectionSort(arr):
    """Return a new list holding the elements of ``arr`` in ascending order.

    The input is left untouched (the previous implementation emptied it by
    popping every element). Equal elements keep their relative order, as they
    did with the hand-written selection sort, because ``sorted`` is stable.

    :param arr: iterable of mutually comparable values; may be empty.
    :return: a new sorted list.
    """
    return sorted(arr)


# Open the three source workbooks (hard-coded paths on the author's desktop).
# NOTE(review): 2.xlsx is opened with xlrd; xlrd 2.0+ reads only .xls files,
# so confirm the installed xlrd version still accepts it.
f_1, f_2, f_3 = [
    xlrd.open_workbook(path)
    for path in (
        r'C:\Users\ysc\Desktop\1.xls',
        r'C:\Users\ysc\Desktop\2.xlsx',
        r'C:\Users\ysc\Desktop\3.xls',
    )
]

# Only the first sheet of each workbook is used.
f_1Sheet1, f_2Sheet1, f_3Sheet1 = [
    book.sheet_by_index(0) for book in (f_1, f_2, f_3)
]
# Handy while exploring a workbook: f_1.sheet_names() lists all sheets,
# f_1Sheet1.row_values(0) / f_1Sheet1.col_values(0) return the first row / column.


# Read one company's segment from the first sheet
def read_excel(start, end):
    """Collect the years of one company from sheet 1 and merge that company.

    Rows ``start`` .. ``end - 1`` of ``f_1Sheet1`` are scanned. The value in
    column 0 of the first row is taken as the company code; column 2 of every
    row carrying that code is collected. As soon as a row with a different
    code is met, the collected values are sorted and handed to ``find_data``
    together with the row range ``[start, that row)``.

    Row index 291 is treated as the end of the data: the current company is
    merged, every record in ``data`` is printed and ``write_excel(data)`` is
    called. If the scan ends without hitting either condition, nothing is
    merged.
    """
    code = ''
    time = []
    for row_idx in range(start, end):
        if row_idx == 291:
            # End of data: flush the last company and write the output file.
            time = selectionSort(time)
            find_data(code, time, start, row_idx)
            for record in data:
                print(record)
            write_excel(data)
            return

        rows = f_1Sheet1.row_values(row_idx)  # contents of the current row
        row_code = rows[0]
        if code == '':
            # First row of the segment defines the company being read.
            code = row_code
            print(time)
            print(code)

        if code != row_code:
            # Reached the next company: merge what has been collected so far.
            time = selectionSort(time)
            find_data(code, time, start, row_idx)
            return

        time.append(rows[2])
        print(time)
        print(code)


def find_data(code, time, start_i, end_i):
    """Build one ``company`` record per year and append it to ``data``.

    For every value in ``time`` the rows ``start_i`` .. ``end_i - 1`` of the
    three sheets are searched for a row whose column 2 equals that value, and
    the matching row's cells are copied into the record. If several rows
    match, the last one wins. The three sheets are indexed with the same row
    numbers, so they must be aligned row by row.

    :param code: company code stored on every record.
    :param time: iterable of year values, processed in the given order.
    :param start_i: first row index of the company's segment (inclusive).
    :param end_i: end row index of the segment (exclusive).
    """
    if not time:
        return

    # The row contents do not depend on the year being matched, so each row
    # is read once up front instead of once per year.
    row_triples = [
        (
            f_1Sheet1.row_values(i),
            f_2Sheet1.row_values(i),
            f_3Sheet1.row_values(i),
        )
        for i in range(start_i, end_i)
    ]

    for t in time:
        new_data = company()  # already a fresh instance; no deepcopy needed
        new_data.time = t
        new_data.code = code
        for f_1rows, f_2rows, f_3rows in row_triples:
            if t == f_1rows[2]:
                new_data.D = f_1rows[7]
                new_data.name = f_1rows[1]
                new_data.G = f_1rows[6]
                new_data.J = f_1rows[8]
                new_data.Q = f_1rows[5]
            if t == f_2rows[2]:
                new_data.F = f_2rows[4]
                new_data.I = f_2rows[3]
                # The "type" flag is 1 when the corresponding ratio is not 0.
                # NOTE(review): any value other than numeric 0, including an
                # empty-string cell, yields 1 -- confirm that is intended.
                new_data.E = 1 if new_data.F != 0 else 0
                new_data.H = 1 if new_data.I != 0 else 0
            if t == f_3rows[2]:
                new_data.K = f_3rows[4]
                new_data.L = f_3rows[6]
                new_data.M = f_3rows[7]
                new_data.N = f_3rows[10]
                new_data.O = f_3rows[11]
                new_data.P = f_3rows[12]

        data.append(new_data)


# Excel writing helpers
def set_style(name, height, bold=False):
    """Build an xlwt cell style that only customises the font.

    :param name: font face name, e.g. 'Times New Roman'.
    :param height: font height as expected by ``xlwt.Font.height``
        (presumably 1/20 of a point, so 220 is 11 pt -- confirm in xlwt docs).
    :param bold: whether the font is bold.
    :return: an ``xlwt.XFStyle`` carrying the configured font; borders are
        left at their defaults.
    """
    font = xlwt.Font()  # font object attached to the style below
    font.name = name
    font.height = height
    font.bold = bold
    font.color_index = 4

    style = xlwt.XFStyle()  # fresh style; only its font is replaced
    style.font = font
    return style


def write_excel(data_list):
    """Write the merged records to ``demo333.xls`` in the working directory.

    Row 0 of the single sheet ``sheet1`` holds the styled header; each record
    of ``data_list`` then fills one row, in order, with its attributes
    ``time, code, name, D .. Q`` in columns 0 .. 16. Each record is also
    printed as it is written.

    :param data_list: iterable of record objects exposing those attributes.
    """
    f = xlwt.Workbook()  # create the workbook
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create the sheet
    row0 = [u'年份 ', u'股票代码', u'公司名称', u'总经理类型', u'管理层类型', u'管理层比例', u'董事长类型', u'董事会类型',
            u'董事会比例', u'CEO及董事长类型', u'公司规模', u'财务杠杆', u'资产净利润率', u'资本密集度', u'托宾Q', u'股利支付率', u'行业']

    # Header row: the style is the same for every cell, so build it once.
    header_style = set_style('Times New Roman', 220, True)
    for col, title in enumerate(row0):
        sheet1.write(0, col, title, header_style)

    # Attribute written to each output column, in column order.
    columns = ('time', 'code', 'name', 'D', 'E', 'F', 'G', 'H', 'I',
               'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q')
    for row, record in enumerate(data_list, start=1):
        print(record)
        for col, attr in enumerate(columns):
            sheet1.write(row, col, getattr(record, attr))

    # The file is saved in the Excel 2003 format, so the extension has to be
    # .xls rather than .xlsx.
    f.save('demo333.xls')


def main():
    """Walk the sheet one company segment at a time.

    Segment start rows form an arithmetic sequence 3, 7, 11, ... below 291
    with a step of 4. Each call to ``read_excel`` gets a window of step + 1
    rows, so the scan can also look at the row following the segment.
    """
    step = 4
    segment_starts = list(range(3, 291, step))
    print('数据分段')
    print(segment_starts)
    for first_row in segment_starts:
        read_excel(first_row, first_row + step + 1)


main()

 

你可能感兴趣的:(python操作excel文件,简单数据整合)