Batch Processing Excel Files with Python

1. Importing Libraries

  xlrd is required by pandas' read_excel.

import zipfile
import os
import pandas as pd
import xlrd
import json
import pygeoip

2. Batch Extracting ZIP Files

  The given folder is a jumble of files, some of which even have spaces in their names (what kind of grudge leads to naming files like that?), so exception handling is a must, and extraction is driven by the file extension.

def unzip_folder_all_zip(src_root):
    for rt, dirs, files in os.walk(src_root):
        for single_file in files:
            # splitext is robust against extra dots and spaces inside filenames
            filename, ext = os.path.splitext(single_file)

            try:
                if ext == '.zip':
                    single_file_path = os.path.join(rt, single_file)
                    print('zip single file path is {}'.format(single_file_path))
                    extract_path = os.path.join(rt, filename)
                    with zipfile.ZipFile(single_file_path, 'r') as zip_ref:
                        zip_ref.extractall(extract_path)
            except Exception as e:
                print(e)
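
  A minimal call, assuming the archives live under a hypothetical folder such as D:\data:

unzip_folder_all_zip(r'D:\data')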

3. Batch Reading Excel Files

  The Excel files I was given carry a .csv extension. (Speechless.)
  Since some of the files are broken, exception handling is needed here as well.
  The earliest version first read an empty file and then performed a concat for every file (far too cumbersome).

def read_folder_all_excel(src_root):
    df_all = pd.DataFrame()

    for rt, dirs, files in os.walk(src_root):
        for single_file in files:
            suffix = single_file.split('.')[-1]

            if suffix == 'csv':
                single_file_path = os.path.join(rt, single_file)
                print('single file path is {}'.format(single_file_path))
                try:
                    # sheet_name, not the long-deprecated sheetname keyword
                    single_df = pd.read_excel(single_file_path, sheet_name=u'特征检测')
                except Exception as e:
                    print(e)
                    continue  # skip unreadable files; otherwise single_df would be stale or undefined
                # DataFrame.append was removed in pandas 2.0; concat does the same job
                df_all = pd.concat([df_all, single_df], ignore_index=True)

    return df_all
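
  Growing the DataFrame inside the loop re-copies the data on every iteration. A common alternative is to collect the pieces in a list and concatenate once at the end; a sketch (read_folder_all_excel_v2 is just an illustrative name):

def read_folder_all_excel_v2(src_root):
    dfs = []
    for rt, dirs, files in os.walk(src_root):
        for single_file in files:
            if single_file.split('.')[-1] == 'csv':
                try:
                    dfs.append(pd.read_excel(os.path.join(rt, single_file), sheet_name=u'特征检测'))
                except Exception as e:
                    print(e)
    # one concat at the end instead of one per file
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()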

4. Saving the Aggregated Data

  The data is read file by file and merged into one large DataFrame, so saving that DataFrame as a pickle file saves a great deal of time on later runs. DataFrame offers a very convenient API for this:

df_all.to_pickle(r'df_all.pkl')

4.1 Reloading the Data

import pandas as pd
df_all = pd.read_pickle(r'df_all.pkl')
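
  A minimal caching pattern that ties the two steps together (the paths are just examples):

if os.path.exists(r'df_all.pkl'):
    df_all = pd.read_pickle(r'df_all.pkl')  # fast path on later runs
else:
    df_all = read_folder_all_excel(r'D:\data')
    df_all.to_pickle(r'df_all.pkl')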

5. Total and Per-Category Frequency Counts

5.1 Total Frequencies

  After counting, the results need to be written out; a CSV file is used here. Note that the encoding must be utf-8-sig: if saved as plain utf-8, the file appears garbled when opened in Excel.

def get_total_events_times(df_all, file_save_path):
    total_events_times = df_all['事件名称'].value_counts()  # value_counts returns a Series
    total_events_times_df = total_events_times.to_frame()
    total_events_times_df.reset_index(inplace=True)
    total_events_times_df.columns = ['事件名称', '发生次数']

    total_events_times_df.to_csv(file_save_path, index=False, encoding='utf_8_sig')

5.2 Per-Category Frequencies

def get_category_events_times(df_all, file_save_path):
    category_events_times = df_all.groupby('安全类型')['事件名称'].agg('value_counts')
    category_events_times_df = category_events_times.to_frame()
    category_events_times_df.columns = ['发生次数']
    category_events_times_df.to_csv(file_save_path, encoding='utf_8_sig')

6. Per-Category Analysis

6.1 Selecting the Relevant Categories

jmr_df = df_all[(df_all['安全类型'] == '蠕虫病毒') | (df_all['安全类型'] == '木马后门')]
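
  An equivalent, slightly more compact form uses isin:

jmr_df = df_all[df_all['安全类型'].isin(['蠕虫病毒', '木马后门'])]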

6.2 Filtering the Data

  Filter out the events related to webshell, php, and 扫描 (scanning); the ~ operator negates the condition, i.e. it keeps the rows that do not contain the pattern. The leading 'w' is dropped from the pattern so that both 'Webshell'/'webshell' and 'WebShell'/'webShell' are caught.

jmr_df = jmr_df[~jmr_df['事件名称'].str.contains('ebshell')]
jmr_df = jmr_df[~jmr_df['事件名称'].str.contains('ebShell')]
jmr_df = jmr_df[~jmr_df['事件名称'].str.contains('php')]
jmr_df = jmr_df[~jmr_df['事件名称'].str.contains('扫描')]
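
  The four filters can also be collapsed into one case-insensitive regex; a sketch that should match the intent above:

jmr_df = jmr_df[~jmr_df['事件名称'].str.contains('webshell|php|扫描', case=False)]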

6.3 Processing by Category

6.3.1 Deriving the Compromised-Host and Attacking-Host IPs

  When several columns need to be processed together, the following code can serve as a reference:

existed_event_df = pd.read_excel(r'fall_computer.xlsx')
jmr_df_merged = jmr_df.merge(existed_event_df, on='事件名称')

def get_fall_computer_ip(source_ip, dest_ip, is_source, fall_flag=True):
    if fall_flag:  # we want the compromised host
        if is_source == 1:
            return source_ip
        else:
            return dest_ip
    else:  # we want the attacking host
        if is_source == 1:  # if the source is the compromised host
            return dest_ip  # then the attacker is the destination IP
        else:
            return source_ip


jmr_df_merged['失陷主机IP'] = jmr_df_merged.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], True), axis=1)
jmr_df_merged['攻击主机IP'] = jmr_df_merged.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], False), axis=1)

6.3.2 Adding the Geographic Location of Each IP

  The GeoLiteCity.dat file must sit in the current directory; it can be downloaded from https://download.csdn.net/download/herosunly/11195594 .

class GeoIP(object):
    def __init__(self, db="GeoLiteCity.dat"):
        self.gi = pygeoip.GeoIP(db, pygeoip.MEMORY_CACHE)

    def region(self, ip):
        res = {}
        info = self.gi.record_by_addr(ip)
        if not info:
            return res
        # use .get so that a record missing a key does not raise KeyError
        res["country"] = info.get("country_code")
        res["city"] = info.get("city")
        return res

  One bug in the original version: the record dict was indexed directly instead of via get, so lookups occasionally failed (fixed with info.get above).
  One source of inefficiency: constructing a fresh GeoIP object for every lookup is very slow, so create a single instance up front.

geoip = GeoIP()
jmr_df_merged['失陷主机IP国家'] = jmr_df_merged['失陷主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
jmr_df_merged['失陷主机IP城市'] = jmr_df_merged['失陷主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
jmr_df_merged['攻击主机IP国家'] = jmr_df_merged['攻击主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
jmr_df_merged['攻击主机IP城市'] = jmr_df_merged['攻击主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
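
  The four apply calls above also resolve the same IP repeatedly. As a further optional optimization, the lookup can be memoized; a sketch using functools.lru_cache (cached_region is just an illustrative name):

from functools import lru_cache

@lru_cache(maxsize=None)
def cached_region(ip):
    # each distinct IP is resolved only once
    return geoip.region(ip)

jmr_df_merged['失陷主机IP国家'] = jmr_df_merged['失陷主机IP'].apply(lambda x: cached_region(x).get('country', 'empty'))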

Code for the attacking hosts' countries:

jmr_attack_country = jmr_df_merged['攻击主机IP国家'].value_counts().to_frame().reset_index()
jmr_attack_country.columns=['country', 'times']

Code for the compromised hosts' cities:

jmr_fall_computer_city = jmr_df_merged[jmr_df_merged['失陷主机IP国家'] == 'CN']['失陷主机IP城市'].value_counts().to_frame().reset_index()
jmr_fall_computer_city.columns = ['city', 'times']

6.3.3 Mapping Abbreviations and Pinyin to Chinese

  The compromised hosts resolve to Chinese cities, but the program only returns pinyin, so the pinyin has to be mapped to the Chinese city names.
  The city.json file can be downloaded from https://download.csdn.net/download/herosunly/11195483 (the file itself is correct; only its listed filename is wrong).

import json
with open('city.json', encoding = 'utf-8') as json_data:
    city_dict = json.load(json_data)

jmr_fall_computer_city['city_name'] = jmr_fall_computer_city['city'].map(lambda x: city_dict.get(x, 'unknown'))
jmr_fall_computer_city.to_csv('jmr_fall_computer_city.csv', encoding = 'utf-8-sig', index = False)
# then make a few simple common-sense corrections by hand

  The attacking hosts resolve to countries, but the program only returns two-letter abbreviations (country codes), so the abbreviations have to be mapped to Chinese names.
  The country.csv file can be downloaded from https://download.csdn.net/download/herosunly/11195628 .

country_df = pd.read_csv(r'country.csv')
jmr_attack_country_df = jmr_attack_country.merge(country_df, on='country')
jmr_attack_country_df.to_csv(r'jmr_attack_computer_country.csv', encoding='utf-8-sig', index = False)

7. Per-Category Analysis II

vulnerability_df = df_all[df_all['安全类型'] == '安全漏洞'].copy()  # copy the slice so the assignment below does not trigger SettingWithCopyWarning
vulnerability_df['source'] = 0  # in the previous analysis this column came from the merge


vulnerability_df['失陷主机IP'] = vulnerability_df.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], True), axis = 1)
vulnerability_df['攻击主机IP'] = vulnerability_df.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], False), axis = 1)

geoip = GeoIP()
vulnerability_df['失陷主机IP国家'] = vulnerability_df['失陷主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
vulnerability_df['失陷主机IP城市'] = vulnerability_df['失陷主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
vulnerability_df['攻击主机IP国家'] = vulnerability_df['攻击主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
vulnerability_df['攻击主机IP城市'] = vulnerability_df['攻击主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
vulnerability_attack_country = vulnerability_df['攻击主机IP国家'].value_counts().to_frame().reset_index()
vulnerability_attack_country.columns=['country', 'times']

vulnerability_fall_computer_city = vulnerability_df[vulnerability_df['失陷主机IP国家'] == 'CN']['失陷主机IP城市'].value_counts().to_frame().reset_index()
vulnerability_fall_computer_city.columns = ['city', 'times']

# city_dict was already loaded from city.json in section 6.3.3
vulnerability_fall_computer_city['city_name'] = vulnerability_fall_computer_city['city'].map(lambda x: city_dict.get(x, 'unknown'))
vulnerability_fall_computer_city.to_csv('vulnerability_fall_computer_city.csv', encoding='utf-8-sig', index=False)
# country_df was already loaded from country.csv in section 6.3.3
vulnerability_attack_country_df = vulnerability_attack_country.merge(country_df, on='country')
vulnerability_attack_country_df.to_csv(r'vulnerability_attack_computer_country.csv', encoding='utf-8-sig', index=False)
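
  Sections 6 and 7 run the same geo-enrichment pipeline twice; wrapping it in a small helper keeps the two analyses in sync. A sketch (add_geo_columns is just an illustrative name):

def add_geo_columns(df, geoip):
    # derive country/city columns for both the compromised and the attacking host
    for ip_col in ['失陷主机IP', '攻击主机IP']:
        df[ip_col + '国家'] = df[ip_col].apply(lambda x: geoip.region(x).get('country', 'empty'))
        df[ip_col + '城市'] = df[ip_col].apply(lambda x: geoip.region(x).get('city', 'empty'))
    return df

vulnerability_df = add_geo_columns(vulnerability_df, geoip)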

  Postscript: besides os.walk, traversal can also be done with glob. It accepts wildcards, but a plain pattern only matches files directly inside one folder; recursing into subfolders requires the ** pattern together with recursive=True (Python 3.5+). For example:

import glob
loop = glob.glob(r'c:\*.*')
for i in loop:
    print(i)
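
  And the recursive variant (the path and pattern are just examples):

for path in glob.glob(r'c:\**\*.csv', recursive=True):
    print(path)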
