python 使用pandas或xlrd、xlwt实现对Excel的读取、添加、追加、删除、重复值处理、缺失值处理、获取行数、列数、获取一行、一列数据等一系列封装

不多说了,又是造轮子的一天。在此我要严厉批评 CSDN 和百度上一堆浑水摸鱼的博客:有些人的代码明明一运行就报错,也照样上传发文;有些则是标题党,代码虽然不报错,却压根没有实现标题所说的功能,纯属浪费读者的时间。

python 使用pandas或xlrd、xlwt实现对Excel的读取、添加、追加、删除、重复值处理、缺失值处理、获取行数、列数、获取一行、一列数据等一系列封装_第1张图片

废话不多说直接贴代码:

先新建一个data文件夹,文件路径我填的是 ./data/

使用xlrd、xlwt:

import xlrd
import xlwt
import xlutils.copy


class XLSX:
    """Read and write the first sheet of ``./data/<name>.xlsx`` via xlrd/xlwt.

    All methods take the bare file name (no directory, no extension); the
    path is always ``'./data/' + name + '.xlsx'``.  Row and column numbers
    are zero-based sheet coordinates unless a method says otherwise, and the
    header occupies sheet row 0.

    NOTE(review): xlwt presumably emits the legacy ``.xls`` binary format even
    though the file is named ``.xlsx``; the extension is kept unchanged so
    existing callers keep finding their files -- confirm before renaming.
    """

    @staticmethod
    def _path(name: str) -> str:
        """Return the on-disk path used for workbook *name*."""
        return './data/' + name + '.xlsx'

    @staticmethod
    def _first_sheet(name: str):
        """Open workbook *name* with xlrd and return its first sheet."""
        return xlrd.open_workbook(XLSX._path(name)).sheet_by_index(0)

    @staticmethod
    def _append_new_column(book, path: str, head, data_list) -> None:
        """Write *head* and *data_list* into the first unused column and save.

        *book* is the xlrd workbook already opened from *path*.
        """
        col = book.sheet_by_index(0).ncols
        writable = xlutils.copy.copy(book)
        target = writable.get_sheet(0)
        target.write(0, col, head)
        for row, value in enumerate(data_list, start=1):
            target.write(row, col, value)
        writable.save(path)

    @staticmethod
    def _rewrite(path: str, rows) -> None:
        """Replace the file at *path* with a single 'Sheet1' holding *rows*."""
        book = xlwt.Workbook()
        sheet = book.add_sheet('Sheet1', cell_overwrite_ok=True)
        for r, row in enumerate(rows):
            for c, value in enumerate(row):
                sheet.write(r, c, value)
        book.save(path)

    def save(self, name: str, head_list, data_list) -> None:
        """Create (or overwrite) workbook *name* from headers and columns.

        Args:
            name: Bare file name.
            head_list: Header cells, written left to right into row 0.
            data_list: Column-major data; ``data_list[i]`` holds the values
                of column ``i``, written from row 1 downwards.
        """
        # A fresh workbook per call: a workbook shared at class level would
        # carry cells written by an earlier save() into every later file.
        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)
        for col, title in enumerate(head_list):
            sheet.write(0, col, title)
        for col, column_values in enumerate(data_list):
            for row, value in enumerate(column_values, start=1):
                sheet.write(row, col, value)
        workbook.save(XLSX._path(name))

    @staticmethod
    def save_append_col(name: str, data_list, head) -> None:
        """Append one new column (header *head*, values *data_list*).

        The column is placed after the last existing column of the sheet.
        """
        path = XLSX._path(name)
        XLSX._append_new_column(xlrd.open_workbook(path), path, head, data_list)

    @staticmethod
    def save_append_col_append(name: str, data_list, head, line_index: int) -> None:
        """Write *data_list* into the column titled *head*, creating it if absent.

        Args:
            name: Bare file name.
            data_list: Values to write, one per row.
            head: Header text looked up in sheet row 0.
            line_index: Only used when the column already exists; the first
                value goes to zero-based sheet row ``line_index - 1`` and the
                following values to the rows below it.

        When *head* is not found, a new column is appended after the last one
        with *head* in row 0 and the values from row 1 downwards.
        """
        path = XLSX._path(name)
        book = xlrd.open_workbook(path)
        header = book.sheet_by_index(0).row_values(0)
        if head not in header:
            XLSX._append_new_column(book, path, head, data_list)
            return
        col = header.index(head)
        writable = xlutils.copy.copy(book)
        target = writable.get_sheet(0)
        for offset, value in enumerate(data_list):
            target.write(line_index + offset - 1, col, value)
        writable.save(path)

    @staticmethod
    def save_append_row(name: str, data_list) -> None:
        """Append rows below the existing data.

        Args:
            name: Bare file name.
            data_list: Row-major data; each item is one row.  Only the first
                ``ncols`` values of a row are written (``ncols`` being the
                current sheet width); a shorter row leaves its trailing cells
                empty instead of raising IndexError.
        """
        path = XLSX._path(name)
        book = xlrd.open_workbook(path)
        source = book.sheet_by_index(0)
        first_free_row = source.nrows
        width = source.ncols
        writable = xlutils.copy.copy(book)
        target = writable.get_sheet(0)
        for offset, row in enumerate(data_list):
            for col, value in enumerate(row[:width]):
                target.write(first_free_row + offset, col, value)
        writable.save(path)

    @staticmethod
    def get_all_line(file_name: str) -> list:
        """Return every row except the header (sheet row 0) as a list of lists."""
        sheet = XLSX._first_sheet(file_name)
        return [sheet.row_values(row) for row in range(1, sheet.nrows)]

    @staticmethod
    def get_all_line_col(file_name: str) -> list:
        """Return every row, header included, as a list of lists."""
        sheet = XLSX._first_sheet(file_name)
        return [sheet.row_values(row) for row in range(sheet.nrows)]

    @staticmethod
    def get_all_col(file_name: str, col_index: int) -> list:
        """Return the values of column *col_index*, skipping the header row."""
        sheet = XLSX._first_sheet(file_name)
        return [sheet.row_values(row)[col_index] for row in range(1, sheet.nrows)]

    @staticmethod
    def get_one_line(file_name: str, line_count: int) -> list:
        """Return the values of sheet row *line_count* (0 is the header)."""
        return XLSX._first_sheet(file_name).row_values(line_count)

    @staticmethod
    def get_one_line_col(file_name: str, line_count: int, col_count: int):
        """Return the cell at sheet row *line_count*, column *col_count*."""
        return XLSX._first_sheet(file_name).row_values(line_count)[col_count]

    @staticmethod
    def get_row_length(file_name: str) -> int:
        """Return the number of rows in the sheet, header included."""
        return XLSX._first_sheet(file_name).nrows

    @staticmethod
    def get_one_col_length(file_name: str, col_index: int) -> int:
        """Count the cells of column *col_index* whose value is not ``''``.

        The header row is included in the count.
        """
        sheet = XLSX._first_sheet(file_name)
        return sum(
            1
            for row in range(sheet.nrows)
            if sheet.cell_value(row, col_index) != ''
        )

    @staticmethod
    def get_col_length(file_name: str) -> int:
        """Return the number of columns in the sheet."""
        return XLSX._first_sheet(file_name).ncols

    def get_repeat_list(self, file_name: str, col_index: int) -> list:
        """Group the positions of repeated values in column *col_index*.

        Positions index the list returned by :meth:`get_all_col`, i.e. 0 is
        the first row below the header.  If positions 1/6/8 hold one value
        and 5/9 another, the result is ``[[1, 6, 8], [5, 9]]``.
        """
        return self.find_duplicates(self.get_all_col(file_name, col_index))

    @staticmethod
    def delete_line(file_name: str, delete_rows) -> None:
        """Rewrite the file without the sheet rows listed in *delete_rows*.

        Args:
            file_name: Bare file name.
            delete_rows: Zero-based sheet row numbers to drop (row 0 is the
                header).
        """
        path = XLSX._path(file_name)
        sheet = xlrd.open_workbook(path).sheet_by_index(0)
        doomed = set(delete_rows)  # O(1) membership while scanning the rows
        kept = [
            sheet.row_values(row)
            for row in range(sheet.nrows)
            if row not in doomed
        ]
        XLSX._rewrite(path, kept)

    @staticmethod
    def deletion(file_name: str) -> None:
        """Fill every empty cell (value ``''``) with the text '缺失补充'.

        Applies to all rows and all columns; the file is rewritten in place.
        """
        path = XLSX._path(file_name)
        sheet = xlrd.open_workbook(path).sheet_by_index(0)
        filled = []
        for r in range(sheet.nrows):
            row = []
            for c in range(sheet.ncols):
                value = sheet.cell_value(r, c)
                # An empty string is treated as a missing value.
                row.append('缺失补充' if value == '' else value)
            filled.append(row)
        XLSX._rewrite(path, filled)

    @staticmethod
    def find_duplicates(strs) -> list:
        """Return index groups of values that occur more than once in *strs*.

        Groups appear in order of each value's first occurrence; indices
        inside a group are ascending.  Values must be hashable.
        """
        positions = {}
        for index, value in enumerate(strs):
            positions.setdefault(value, []).append(index)
        return [indices for indices in positions.values() if len(indices) > 1]


# 测试
# xx = XLSX()
# xx.save('ddd', ['表头1', '表头2', '表头3'],
#         [['列1-第1行', '列1-第2行', '列1-第3行'], ['列2-第1行', '列2-第2行', '列2-第3行'],
#          ['列3-第1行', '列3-第2行', '列3-第3行']])
# xx.save_append_row('ddd', [['列1-第4行', '列2-第4行', '列3-第4行'], ['列1-第5行', '列2-第5行', '列3-第5行'],
#                            ['列1-第6行', '列2-第6行', '列3-第6行']])
# xx.save_append_col('ddd', ['列4-第1行', '列4-第2行', '列4-第3行'], '表头4')
# xx.save_append_col_append('ddd', ['列4-第1行', '列4-第2行', '列4-第3行'], '表头4', 5)
# xx.delete_line('ddd', [5, 6])
# xx.deletion('ddd')
# print(xx.get_all_line('ddd'))
# print(xx.get_one_line('ddd', 2))
# print(xx.get_all_col('ddd', 3))
# print(xx.get_one_line_col('ddd', 1, 0))
# print(xx.get_row_length('ddd'))
# print(xx.get_col_length('ddd'))
# print(xx.get_repeat_list('ddd', 1))
# print(xx.get_one_col_length('ddd', 3))

使用pandas:

import os
import numpy as np
import pandas as pd


class EXCEL:
    """Read and write ``./data/<name>.xlsx`` with pandas.

    All methods take the bare file name (no directory, no extension).  The
    file is read with ``pd.read_excel`` defaults, so row positions used by
    the accessors refer to DataFrame rows and column positions to DataFrame
    columns.

    NOTE(review): every method except ``append_sheet`` writes back with
    ``DataFrame.to_excel(path)``, which presumably produces a fresh
    single-sheet workbook -- confirm whether sheets added through
    ``append_sheet`` need to survive those calls.
    """

    @staticmethod
    def _path(name):
        """Return the on-disk path used for workbook *name*."""
        return r'./data/' + name + '.xlsx'

    @staticmethod
    def _frame_from_columns(head_list, data_list):
        """Build a DataFrame from headers and column-major data.

        ``data_list[i]`` becomes the column named ``str(head_list[i])``.
        """
        return pd.DataFrame(
            {str(head): data_list[i] for i, head in enumerate(head_list)}
        )

    @staticmethod
    def save(name, head_list, data_list):
        """Create (or overwrite) workbook *name* from headers and columns.

        Args:
            name: Bare file name.
            head_list: Column headers (converted with ``str``).
            data_list: Column-major data, one list per header.
        """
        frame = EXCEL._frame_from_columns(head_list, data_list)
        frame.to_excel(EXCEL._path(name), index=False)

    @staticmethod
    def append_sheet(name, sheet_name, head_list):
        """Add a sheet *sheet_name* containing only the header row *head_list*.

        When the workbook does not exist yet it is created with that single
        sheet; otherwise the sheet is appended to the existing file.
        """
        frame = pd.DataFrame(columns=head_list)
        path = EXCEL._path(name)
        # Append mode needs an existing workbook to load, so fall back to
        # write mode for a missing file instead of failing.
        mode = 'a' if os.path.exists(path) else 'w'
        with pd.ExcelWriter(path, mode=mode, engine='openpyxl') as writer:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    @staticmethod
    def append_row(name, head_list, data_list):
        """Append rows below the existing data.

        Args:
            name: Bare file name.
            head_list: Headers identifying the columns being extended.
            data_list: Column-major data; ``data_list[i]`` holds the new
                values for ``head_list[i]``.
        """
        path = EXCEL._path(name)
        addition = EXCEL._frame_from_columns(head_list, data_list)
        existing = pd.read_excel(path)
        pd.concat([existing, addition]).to_excel(path, index=False)

    @staticmethod
    def append_col(name, head_name, data_list):
        """Append one column named *head_name* holding *data_list*."""
        path = EXCEL._path(name)
        existing = pd.read_excel(path)
        column = pd.Series(data_list)
        column.name = head_name
        combined = pd.concat([existing, column], axis=1)
        combined.to_excel(path, header=True, index=False)

    @staticmethod
    def get_all_line(file_name):
        """Return all data rows as a list of lists (row-major)."""
        return pd.read_excel(EXCEL._path(file_name)).values.tolist()

    @staticmethod
    def get_all_line_col(file_name):
        """Return all data as a list of columns (column-major)."""
        frame = pd.read_excel(EXCEL._path(file_name))
        return [frame[label].values.tolist() for label in frame.columns.tolist()]

    @staticmethod
    def get_all_col(file_name, col_index):
        """Return the values of the column at position *col_index*."""
        frame = pd.read_excel(EXCEL._path(file_name))
        return frame.iloc[:, col_index].values.tolist()

    @staticmethod
    def get_one_line(file_name, line_count):
        """Return the values of the data row at position *line_count*."""
        frame = pd.read_excel(EXCEL._path(file_name))
        return frame.iloc[line_count].values.tolist()

    @staticmethod
    def get_one_line_col(file_name, line_count, col_count):
        """Return the cell at row position *line_count*, column position *col_count*.

        A value that comes back as ``numpy.float64`` is wrapped in a
        one-element ``pandas.Series``; every other value is returned as is.
        NOTE(review): the wrapping is kept for backward compatibility -- its
        purpose is not evident from this file.
        """
        frame = pd.read_excel(EXCEL._path(file_name))
        row = frame.iloc[line_count]
        # Explicit positional access: plain row[col_count] relies on the
        # deprecated integer fallback and turns into a label lookup when the
        # headers themselves are integers.
        cell = row.iloc[col_count]
        if isinstance(cell, np.float64):
            cell = pd.Series(cell)
        return cell

    @staticmethod
    def get_row_length(file_name):
        """Return the number of DataFrame rows in the file."""
        return pd.read_excel(EXCEL._path(file_name)).shape[0]

    @staticmethod
    def get_col_length(file_name):
        """Return the number of DataFrame columns in the file."""
        return pd.read_excel(EXCEL._path(file_name)).shape[1]

    @staticmethod
    def delete_line(file_name, delete_rows):
        """Drop the data rows at the positions in *delete_rows* and save.

        Args:
            file_name: Bare file name.
            delete_rows: List of row positions into the DataFrame.
        """
        path = EXCEL._path(file_name)
        frame = pd.read_excel(path)
        frame.drop(frame.index[delete_rows], inplace=True)
        frame.to_excel(path, index=False)

    def get_repeat_list(self, file_name, col_index):
        """Group the row positions of repeated values in column *col_index*.

        If positions 1/6/8 hold one value and 5/9 another, the result is
        ``[[1, 6, 8], [5, 9]]``.
        """
        return self.find_duplicates(self.get_all_col(file_name, col_index))

    @staticmethod
    def deletion(file_name, col_head_name):
        """Fill missing values of column *col_head_name* with '缺失补充'.

        The same placeholder text is used regardless of the column's dtype;
        the file is rewritten in place.
        """
        path = EXCEL._path(file_name)
        frame = pd.read_excel(path)
        frame[col_head_name] = frame[col_head_name].fillna('缺失补充')
        frame.to_excel(path, index=False)

    @staticmethod
    def find_duplicates(strs):
        """Return index groups of values that occur more than once in *strs*.

        Groups appear in order of each value's first occurrence; indices
        inside a group are ascending.  Values must be hashable.
        """
        positions = {}
        for index, value in enumerate(strs):
            positions.setdefault(value, []).append(index)
        return [indices for indices in positions.values() if len(indices) > 1]


# 测试
# ee = EXCEL()
# ee.save('ggg', ['表头1', '表头2', '表头3'],
#         [['列1-第1行', '列1-第2行', '列1-第3行'], ['列2-第1行', '列2-第2行', '列2-第3行'],
#          ['列3-第1行', '列3-第2行', '列3-第3行']])
#
# ee.append_sheet('ggg', 'Sheet2', ['表头1', '表头2', '表头3'])
#
# ee.append_row('ggg', ['表头1', '表头2', '表头3'],
#               [['列1-第1行', '列1-第2行'], ['列2-第1行', '列2-第2行'], ['列3-第1行', '列3-第2行']])
#
# ee.append_col('ggg', '表头4', ['列4-第1行', '列4-第2行', '列4-第3行'])
# ee.delete_line('ggg', [2, 3])
# ee.deletion('ggg', '表头4')
#
# print(ee.get_all_line('ggg'))
# print(ee.get_one_line('ggg', 2))
# print(ee.get_all_col('ggg', 3))
# print(ee.get_one_line_col('ggg', 1, 0))
# print(ee.get_row_length('ggg'))
# print(ee.get_col_length('ggg'))
# print(ee.get_repeat_list('ggg',1))
# ee.repeat_count('ggg')  # 注意:EXCEL 类中并没有定义 repeat_count 方法,取消注释后这一行会报 AttributeError

两套实现选一个用就行了,不要混在一起用:两者的约定并不完全相同。例如 xlrd/xlwt 版本的行号把表头算作第 0 行(`get_one_line('ddd', 0)` 返回的是表头),而 pandas 版本把表头当作列名,行号从第一行数据开始计数。

最后我想说:


python 使用pandas或xlrd、xlwt实现对Excel的读取、添加、追加、删除、重复值处理、缺失值处理、获取行数、列数、获取一行、一列数据等一系列封装_第2张图片

你可能感兴趣的:(python,pandas,excel)