Python Pandas比较BOM表数据

前言背景

年前运用Python写的小应用，借此刻写本文的机会，再次整理总结。
我们公司电子工程师会使用Mentor工具导出一张BOM表，然后相关专员会将其手动输入到SAP系统里 ( 为何不能一键导入？）。但由于数据庞杂，专员担心手动输入可能有误，于是又将SAP里数据导出，让工程师去比对确认材料、位号和数量有无错误。
有一次看到某同事一直忙于在整理Excel、比较数据，于是我就想能否用Pandas和PyQT5去写个小工具自动比较并导出结果？
工具界面十分简单，只要选择SAP数据表和Mentor数据表后，点击保存至Excel即可。有差异的地方按照自己想要的样式呈现在Excel，如此节省工程师整理比较数据时间。
SAP表里可能会有个某个大类产品下多个型号的设备BOM数据，而Mentor 表是单一型号设备数据。SAP 表每一行都是某个位号的材料信息，而Mentor 表里每行是某个Part No信息，2张表样式（部分）如下：

SAP和Mentor数据表(部分内容)

从上图中可看出SAP和Mentor数据表里都有很多列，但我们需要比较的分别是SAP 的Material,Componet(同Mentor里的PART NO) 和Installation point(同Mentor里的REF DES)，和Mentor里的PART NO,REF DES以及COUNT。
SAP Material列筛选后有4个不同的数据，50149261-003，50149261-013，50149261-023，50149261-033，它们是用于区分同类产品下的4个不同module的设备。Mentor表文件名称包含这些字段，如文件名50149261-033(SCBIP-24V)。

工具界面

小工具界面

SAP数据

我们Upload SAP BOM表时需要对Excel进行简单判断，要求其必须含有Installation point列(Mentor不含此列)。选择Mentor数据表时，要求Excel文件名称含关键字段，否则给予错误提示。

    def loadSAPData(self):
        file_Path, _ = QFileDialog.getOpenFileName(self, 'Open file', "C:\\", 'Excel files(*.xlsx , *.xls)')
        # 如果用户不选择文件，直接取消选择，
        if file_Path =='':
            return
        df = pd.read_excel(file_Path) #根据路径读取excel文件
        columns = df.columns.values.tolist() # 获取表所有的列名
        if 'Installation point' in columns: # 判断关键字在列名称列表里
            self.label_sap_file.setText(file_Path.split('/')[-1]) #文件名显示在界面上
            self.sap_df  = df 
            self.materials = df['Material'].unique() #列表，存放不同型号
            self.label_sap.setStyleSheet("QLabel{border-image: url(:icons/excel.png);}") #label设置图片
        else:
            self.showMessage()  #选择的文件不含有Installation point列，则提示错误信息

    def showMessage(self):
        self.materials = [] #清空列表里的material
        messageBox = QMessageBox()
        messageBox.setWindowIcon(QIcon(':icons/error.png')) #注意为了打包后APP上能够显示图片，此处使用冒号，icons 是项目里的一个自己创建的文件夹
        messageBox.setWindowTitle('Error Message')
        messageBox.setText(
            "This file doesn't include a column named 'Installation point'.\nWould you like reload a file? ")
        messageBox.setStandardButtons(QMessageBox.Yes | QMessageBox.No)
        buttonY = messageBox.button(QMessageBox.Yes)
        buttonY.setText('Select File')
        buttonN = messageBox.button(QMessageBox.No)
        buttonN.setText('Cancel')
        messageBox.exec_()
        if messageBox.clickedButton() == buttonY:
            self.loadSAPData()

选择文件错误提示

根据material清洗整理SAP数据

前文中说到SAP数据表里包含多个module的数据，而Mentor数据表是单个型号的数据，其文件名称里包含material字段。

首先需要根据material来筛选出不同的数据：

def getSperatedDF(self,  material):
        # 根据材料名筛选，如'50149261-003'
        df = self.sap_df.loc[self.sap_df['Material'] == material]
        # 无重复的location 数量
        count_df = df.groupby('Component', sort=False)[['Installation point']].nunique().reset_index()
        # 同一颗的不同位置拼接在一起并空格分隔
        gpby_df = df.groupby('Component', sort=False).apply(lambda x: ' '.join(x['Installation point'])).reset_index()
        # 取出'Component', 'Installation point' 2 列数据
        gpby_df.columns = ['Component', 'Installation point']

        gpby_df['SAP_COUNT'] = count_df['Installation point']
        # 对位号字符串排序
        sorted_df = self.tool.getSortedLocationsDF(gpby_df,'Installation point')
        return sorted_df

接下来需要对Installation point列各单元格里杂乱的位号字符串进行排序，如将C1 C20 C4 C3排成C1 C3 C4 C20。

    def sortLocationstr(self, installation_point_str):
        if installation_point_str !='': #判断不为空
            split_list = re.findall(r'[0-9]+|[a-zA-Z]+',installation_point_str)
            first_letters = split_list[0] #获取第一个字母，是C/R/…
            numStr_list = list(set(split_list)) # 对所有去重，用set集合方法
            numStr_list.remove(first_letters)# 去掉字母，只留下数字
            num_list= []
            for num_str in numStr_list:
                num_list.append(int(num_str))
            num_list = sorted(num_list) #数字进行排序
            locations = [] #位置
            for num in num_list:
                location = first_letters + str(num)
                locations.append(location)
            return ' '.join(locations)
        else:
            return  ''

根据列名称和df文件生成排序后的文件：

   def getSortedLocationsDF(self, df, column_name):
        for i in range(df.shape[0]): # shape[0]获取行数
            installation_point_str = df.loc[i, column_name]
            df.loc[i, column_name] = self.sortLocationstr(installation_point_str) # 对位置字符串排序
        return df

Mentor数据

    def loadMentorData(self):
        file_Path, _ = QFileDialog.getOpenFileName(self, 'Open file', "C:\\", 'Excel files(*.xlsx , *.xls)')
        # 如果用户不选择文件，直接取消选择，
        if file_Path == '':
            return
        df = pd.read_excel(file_Path)
        material = file_Path.split('/')[-1].split('.')[0]
        if '(' in material:
            material = material.split('(')[0]
        if material in self.materials:
            self.label_mentor_file.setText( file_Path.split('/')[-1])
            self.label_mentor.setStyleSheet("QLabel{border-image: url(:icons/excel.png);}")
            self.mentor_file_name = material
            self.mentor_df = df
            self.mentor_df = self.mentor_df[['PART NO','REF DES','COUNT']].fillna('') # 选出 'PART NO','REF DES','COUNT' 3列数据
            self.tool.getSortedLocationsDF(self.mentor_df,'REF DES') # 对'REF DES' 列单元格里的位置排序
            self.material_df = self.getSperatedDF(material)
        else:
            #以下代码主要是弹出错误提示
            self.mentor_file_name = ''
            messageBox = QMessageBox()
            messageBox.setWindowTitle('Error Message')
            messageBox.setWindowIcon(QIcon(':icons/error.png'))
            messageBox.setText(
                "This file isn't part of the SAP data. \nWould you like reload a file? ")
            messageBox.setStandardButtons(QMessageBox.Yes | QMessageBox.No)
            buttonY = messageBox.button(QMessageBox.Yes)
            buttonY.setText('Select File')
            buttonN = messageBox.button(QMessageBox.No)
            buttonN.setText('Cancel')
            messageBox.exec_()
            if messageBox.clickedButton() == buttonY:
                self.loadMentorData()

合并数据并比较

至此我们需要的material_df 和 mentor_df已经处理完毕，接下来需要将2者merge并进行比较。

    def mergeData(self):
        compare_df = pd.merge(self.material_df,self.mentor_df,left_on='Component',right_on='PART NO',how= 'outer')
        compare_df = compare_df.fillna('')
        compare_df['Component_Bool']  = compare_df.apply(lambda x: self.tool.compare(x['Component'],x['PART NO']),axis = 1)
        compare_df['COUNT_Bool']      = compare_df.apply(lambda x: self.tool.compare(x['SAP_COUNT'],x['COUNT']),axis = 1 )
        compare_df['Location_Bool']   = compare_df.apply(lambda x: self.tool.compare(x['Installation point'],x['REF DES']),axis = 1 )
        compare_df['Location_Change'] = compare_df.apply(lambda x: self.tool.compareLocation(str(x['Installation point']), str(x['REF DES'])), axis = 1)
        return  compare_df

这里主要比较对应的两列内容是否一致，一致就返回True，否则为False。

    def compare(self, a, b):
        if a == b:
            return 'True'
        else:
            return 'False'

比较 2个位号字符串差异并返回差异结果，如C2,C4,C7 和C2,C3,C7, 它们的差异就是C3和C4。

    def compareLocation(self, sap_loction, mentor_locatoin):
        if set(sap_loction.split(' ')).difference(mentor_locatoin.split(' ')) != {''}:
            location_list = list(set(sap_loction.split(' ')).difference(set(mentor_locatoin.split(' '))))
        else:
            location_list = list(set(mentor_locatoin.split(' ')).difference(set(sap_loction.split(' '))))
        return ' '.join(location_list)

Excel格式化

创建一个Excel 类并设置其相关格式，根据你自己的需求进行格式修改。

import numpy as np
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Side, Border, PatternFill, Font
from openpyxl.styles.colors import BLACK, RED, YELLOW


class Excel(object):

    def __init__(self,excel_path):
        super().__init__()
        self.wb = load_workbook(excel_path)
        self.ws = self.wb.active
        self.excel_path = excel_path
        self.font_name = 'Biome Light'

    def formatExcel(self):
        font        = Font(name = self.font_name , size=10, color = BLACK) #字体
        false_font  = Font(name = self.font_name, bold=True, size=10, color = BLACK) #false单元格字体
        alignment = Alignment(horizontal='center', #水平居中
                              vertical='center', #垂直居中
                              wrap_text=True,  # 文字换行
                              shrink_to_fit=False,  # 自适应宽度，改变文字大小,上一项false
                              )
        thin = Side(border_style="thin", color=BLACK) #边框线颜色
        border = Border(top=thin, left=thin, right=thin, bottom=thin) #边框线
        header_fill = PatternFill(fill_type='solid', fgColor= YELLOW ) #表头填充颜色
        false_fill = PatternFill(fill_type='solid', fgColor= RED) #false单元格填充颜色
        header_font = Font(name=self.font_name, bold=True, size=12, color=BLACK)  #表头单元格字体

        for row in self.ws.rows:
            for cell in row:
                cell.font = font
                cell.alignment = alignment
                cell.border = border
                if cell.value == 'False':
                    cell.fill = false_fill
                    cell.font = false_font
        # 如果是第一行，列名称那行
        for cell in list(self.ws.rows)[0]:
            cell.fill = header_fill
            cell.font = header_font
        self.setColumnsWidth(25)
        self.wb.save(self.excel_path)

    def setColumnsWidth(self,width):
        column = self.ws.max_column  # 获取表格列数
        numbers = np.arange(65, 65 + column)  # 大写字母A 是65
        for i in [chr(i) for i in numbers]:  #生产字母A,B,C,D……
            self.ws.column_dimensions[i].width = width

导出Excel文件

 def saveToExcel(self):
        compare_df = self.mergeData()
        sorted_df = self.tool.getSortedLocationsDF(compare_df,'Location_Change')
        sorted_df.to_excel('{}_compare.xlsx'.format(self.mentor_file_name),sheet_name= 'compare', engine = 'openpyxl',index=False)
        excel = Excel('{}_compare.xlsx'.format(self.mentor_file_name))
        excel.formatExcel() #设置格式

对比结果（部分）

打包调用图片

我们需要创建一个名称为makeqrc.py的文件，运行此文件会生成2个文件，分别是images.qrc和images.py文件。
十分重要的一点是，必须在导包的文件里import images,images 是images.py。

import subprocess, os

images = os.listdir('icons') #icons是存放图片文件的文件夹

f = open('images.qrc', 'w+')
f.write(u'\n\n\n')
for item in images:
    f.write(u'icons/'+ item +'\n')
# for item in qss:
#     f.write(u'qss/'+ item +'\n')

f.write(u'\n')
f.close()
pipe = subprocess.Popen(r'pyrcc5 -o images.py images.qrc', stdout = subprocess.PIPE, stdin = subprocess.PIPE, stderr = subprocess.PIPE, creationflags=0x08)

导包

在Terminal 里输入下面代码：

pyinstaller -F -w - i icon.ico **.py

其中icon.ico为exe文件添加图标的文件名， -w是无黑窗口，即报错时看不到打印信息。