# Test environment: Python 3 on Windows 10; behavior may differ slightly in other environments.
# coding=utf-8
from pandasql import sqldf
import pandas as pd
from itertools import product
import numpy as np
from random import choices
import os
class SheetSplit():
    """Split one worksheet of an Excel workbook into several workbooks.

    Rows are grouped by the combination of values found in the ``features``
    columns. Every combination that matches at least one row is written to
    its own ``.xlsx`` file, named after the values joined with ``-``.
    """

    def __init__(self, features=None, file='', sheetname='Sheet1', out_dir=''):
        """Store the split configuration.

        Args:
            features: Column names to split on. ``None`` (the default) means
                no features; the sequence is copied so neither the caller's
                list nor a shared default object is aliased.
            file: Full path of the Excel workbook to read.
            sheetname: Name of the worksheet to read.
            out_dir: Directory that receives the output. An empty string
                selects the directory part of ``file``.
        """
        self.features = list(features) if features is not None else []
        self.file = file
        self.sheet_name = sheetname
        # Kept as a public attribute for code that reads it; row selection
        # itself no longer goes through SQL text (see _select_rows).
        self.sql_where = ' = "{}" and '.join(map(str, self.features)) + ' = "{}"'
        self.out_dir = out_dir if out_dir != '' else os.path.dirname(file)
        # Worksheet loaded by main(); split() falls back to it.
        self.df = None

    def mkfile(self):
        """Write a random 50-row sample workbook ``fruits_test.xlsx`` into ``out_dir``."""
        area = ['广州', '深圳', '东莞', '佛山']
        fruits = ['apple', 'orange', 'banana', 'watermelon', 'pear']
        rows = 50
        df = pd.DataFrame(np.random.randint(30, 150, size=rows), columns=['销售'])
        df['水果'] = choices(fruits, k=rows)
        df['地区'] = choices(area, k=rows)
        df = df.reindex(columns=['地区', '水果', '销售'])
        df.to_excel(os.path.join(self.out_dir, 'fruits_test.xlsx'), index=False)

    def not_file_exists(self, df):
        """Return the configured features that are not columns of ``df``."""
        columns = df.columns
        return [each for each in self.features if each not in columns]

    def feature_product(self, df):
        """Return an iterator over the Cartesian product of each feature's unique values.

        Each yielded tuple holds one value per feature, in ``features`` order.
        """
        return product(*(df[each].unique() for each in self.features))

    def _select_rows(self, df, feature_select):
        """Return the rows of ``df`` whose feature columns equal ``feature_select``."""
        # Compare values directly instead of formatting them into a SQL
        # string: quotes inside a value, values that happen to equal a column
        # name, and non-string values are all handled correctly this way.
        mask = pd.Series(True, index=df.index)
        for column, value in zip(self.features, feature_select):
            mask &= df[column].eq(value)
        return df[mask]

    def split(self, feature_product, df=None):
        """Write one workbook per non-empty feature combination.

        Files are created in the ``EXCEL切分文件`` sub-directory of ``out_dir``.

        Args:
            feature_product: Iterable of value tuples, one value per feature
                (as produced by :meth:`feature_product`).
            df: Data to split. Defaults to the worksheet loaded by
                :meth:`main`.

        Raises:
            ValueError: If no data frame was passed and none has been loaded.
        """
        if df is None:
            df = self.df
        if df is None:
            raise ValueError('no data loaded: pass df or call main() first')
        split_file_dir = os.path.join(self.out_dir, 'EXCEL切分文件')
        # exist_ok lets the tool be re-run; writing through full paths avoids
        # changing the process-wide working directory.
        os.makedirs(split_file_dir, exist_ok=True)
        for feature_select in feature_product:
            split_df = self._select_rows(df, feature_select)
            if split_df.shape[0] == 0:
                continue
            # str() so that numeric feature values can be part of the name.
            file_name = '-'.join(map(str, feature_select)) + '.xlsx'
            split_df.to_excel(os.path.join(split_file_dir, file_name), index=False)
            print('正在导出文件:{}'.format(file_name))

    def main(self):
        """Validate the configuration, read the worksheet and split it.

        Raises:
            SystemExit: After printing a message, when the file path or the
                feature list is empty, or when a feature is not a column of
                the worksheet.
        """
        if not self.file or not self.features:
            print('您未输入切分特征选项或文件路径,请检查后再操作......')
            # Same effect as the interactive exit() helper, without relying
            # on the site module having installed it.
            raise SystemExit
        # Read the worksheet.
        df = pd.read_excel(self.file, sheet_name=self.sheet_name)
        self.df = df
        # Every requested feature must be a column of the worksheet.
        out_features = self.not_file_exists(df)
        if out_features:
            print('您输入的以下特征不存在,请重新输入:{}'.format(';'.join(map(str, out_features))))
            raise SystemExit
        # Enumerate the feature-value combinations, then export each one.
        self.split(self.feature_product(df), df)
if __name__ == '__main__':
    # Script entry point: split the sample workbook by region and fruit.
    splitter = SheetSplit(
        file=r"D:\桌面\fruits_test.xlsx",
        features=['地区', '水果'],
    )
    splitter.main()