Sub ExtractBySite()
Dim Wb1, Wb2, Wb3 As Workbook
Dim cPath$, siteName$, newFilePath$
cPath = ThisWorkbook.Path & "\" '获取本文件所在路径
Set Wb1 = ThisWorkbook '如果放在本文件中执行则直接用这个读取站点信息
'提示选择数据文件
MsgBox ("请选择要拆分的数据文件")
FilePathFull = Application.GetOpenFilename("所有文件 (*.*), *.*", 0, "选定文件", , False)
If FilePathFull <> "False" Then
Filename = Right(FilePathFull, Len(FilePathFull) - InStrRev(FilePathFull, "\"))
FilePath = Left(FilePathFull, Len(FilePathFull) - Len(Filename))
FileHead = Left(Filename, InStrRev(Filename, "-") - 1)
'MsgBox "FilePath:" & FilePath & vbCrLf & "FileName:" & Filename & vbCrLf & "Filehead:" & FileHead
End If
Set Wb2 = Workbooks.Open(FilePathFull)
Application.ScreenUpdating = False '关闭屏闪,加速处理
'逐个获取sheet(FileHead)第i个站点的名字,去FileHead数据文件中读取站点数据,保存到以站点命名的新文件中
For i = 2 To Wb1.Sheets(FileHead).UsedRange.Rows.Count
siteName = Wb1.Sheets(FileHead).Cells(i, 2).Value
'循环读WB2数据到WB3
a = 2
For j = 2 To Wb2.Sheets(1).UsedRange.Rows.Count
If Wb2.Sheets(1).Cells(j, 1) = siteName Then
If a = 2 Then
'创建新表WB3
Set Wb3 = Workbooks.Add()
'复制表头到新表WB3
Wb2.Sheets(1).Rows(1).Copy
Wb3.Sheets(1).Cells(1, 1).PasteSpecial
Application.CutCopyMode = False '取消选择
'复制数据行
Wb2.Sheets(1).Cells(j, 1).EntireRow.Copy Destination:=Wb3.Sheets(1).Cells(a, 1)
Else
Wb2.Sheets(1).Cells(j, 1).EntireRow.Copy Destination:=Wb3.Sheets(1).Cells(a, 1)
End If
Application.CutCopyMode = False '取消选择,才能选中复制下一行
a = a + 1
End If
Next j
'保存数据到新表WB3文件
If a > 2 Then
newFilePath = cPath & siteName & ".xlsx"
Wb3.SaveAs (newFilePath)
Wb3.Close (False)
End If
Next i
Application.ScreenUpdating = True
MsgBox "拆分完毕!", vbInformation, "提示"
End Sub
python:
# -*- coding: utf-8 noBOM-*-
"""
@用法:1. 修改 refColumname引号中的值为'被拆分文件'里要遵照拆分的字段名
2. 输入'被分割文件'的路径+名称,不需要加引号。
3. 会自动在要拆分文件的同目录下创建以文件名的‘—’前字符为名称的目录,并输出拆分的文件
@Created on 2020/4/10
@author: abraham
"""
import pandas as pd
import time
import os
from myDebug.myPrint import myPrint
#打印调试信息开关
DEBUG_LEVEL=0 #please refer to myDebug.myPrint
def splitExcel(splitFilename:str, refColumname:str):
myPrint(1, "============Begin to split the file================")
starttime = time.time()
fileHead = ''.join(splitFilename.split('\\')[-1:]).split('-')[0]
filePath = "\\".join(splitFilename.split('\\')[:-1])+"\\"+fileHead+"\\"
if not os.path.exists(filePath):
os.makedirs(filePath)
myPrint(1, splitFilename, refColumname,splitFilename[-3:],filePath)
if splitFilename[-3:] == "csv":
data = pd.read_csv(splitFilename)
else:
io = pd.io.excel.ExcelFile(splitFilename)
data = pd.read_excel(io,sheetname=1) #多sheet下,放在io里读取速度要比这样读取快pd.read_excel(splitFilename)
io.close()
rows = data.shape[0] #获取行数,不含表头; shape[1]获取列数
myPrint(DEBUG_LEVEL, "data rows:", rows)
site_dict = {}
site_list = []
newFileDir_dict = {}
#遍历splitFilename,存成字典newFileDir_dict,把同一个站点的数据存在一个key值下的第二列的DataFrame里
for i in range(rows):
if i%10000==0: #减少输出调试信息占用的时间,大幅缩短程序运行时间
myPrint(1, i, "of rows", rows, "run time:%s Seconds"%int(time.time()-starttime))
site_name = data[refColumname][i] #是从数据行开始,所以i不能从1开始
if site_name not in site_dict: #转成dict可以把时间缩短一半
site_list.append(site_name)
site_dict = dict.fromkeys(site_list,True) #用了这句可以把时间缩短一半
newFileDir_dict[site_name] = [filePath+"\\"+site_name+".xlsx", pd.DataFrame()]
newFileDir_dict[site_name][1] = newFileDir_dict[site_name][1].append(data.iloc[[i],:],ignore_index = True)
#即使开关为0也严重影响了运行速度,因此注销myPrint(DEBUG_LEVEL, "data.iloc[[i],:]:\n", data.iloc[[i],:]) #每一行数据都会打印出表头
myPrint(DEBUG_LEVEL, "newFileDir_dict:", newFileDir_dict, '\n', newFileDir_dict.items(), '\n', newFileDir_dict.keys())
#按字典key值保存成excel文件
for site_name in newFileDir_dict.keys():
newFileDir = newFileDir_dict[site_name][0]
newFileDir_dict[site_name][1].to_excel(newFileDir, sheet_name=site_name, index = False, columns = data.columns.values)
myPrint(1, "Generate file: "+newFileDir)
myPrint(1, "============End of the split the file================")
if __name__ == '__main__' :
print("====用法:")
print("====1.直接执行py程序的:请先修改文件中refColumname引号中的值为'被分割文件'里字段的名字")
print("====2.输入'被分割文件'的路径+名称,不需要加引号,可以使用复制粘贴,须右键点对话框边框")
print("====3.会自动在要拆分文件的同目录下创建以文件名的‘—’前字符为名称的目录,并输出拆分的文件\n")
splitFilename = input("请输入要拆分的文件路径及名称:")
refColumname = input("请输入要按照哪个字段名拆分:")
splitExcel(splitFilename, refColumname)
myPrint是参考网上的封装了print的一个模块,目的是增加记录时间的功能,并能通过开关控制是否打印日志。 ↩︎