Of the imports below, xlrd is what read_excel needs.
import zipfile
import os
import pandas as pd
import xlrd
import json
import pygeoip
The given folder is a grab bag of files, and some filenames even contain spaces (what kind of grudge leads to naming files like that?), so exception handling is a must; extraction is then driven by the file extension.
def unzip_folder_all_zip(src_root):
    for rt, dirs, files in os.walk(src_root):
        for single_file in files:
            # os.path.splitext is safer than split('.') for names with more than one dot
            filename, ext = os.path.splitext(single_file)
            try:
                if ext == '.zip':
                    single_file_path = os.path.join(rt, single_file)
                    print('zip single file path is {}'.format(single_file_path))
                    zip_ref = zipfile.ZipFile(single_file_path, 'r')
                    extract_path = os.path.join(rt, filename)
                    zip_ref.extractall(extract_path)
                    zip_ref.close()
            except Exception as e:
                print(e)
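A minimal usage sketch; the root folder here is a made-up example path:

unzip_folder_all_zip(r'D:\security_data')  # hypothetical data directory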
The Excel files I was given use a .csv extension. (Speechless.)
Some of the Excel files are also broken, so exception handling is needed here as well.
The earliest version first read in an empty file and then ran a concat for every file (far too cumbersome).
def read_folder_all_excel(src_root):
    df_all = pd.DataFrame()
    for rt, dirs, files in os.walk(src_root):
        for single_file in files:
            suffix = single_file.split('.')[-1]
            if suffix == 'csv':
                single_file_path = os.path.join(rt, single_file)
                print('single file path is {}'.format(single_file_path))
                try:
                    # note: newer pandas spells the argument sheet_name, not sheetname
                    single_df = pd.read_excel(single_file_path, sheet_name=u'特征检测')
                except Exception as e:
                    print(e)
                    continue  # skip broken files instead of re-appending the previous frame
                df_all = df_all.append(single_df, ignore_index=True)
    return df_all
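A cleaner variant of the same loop, and the idiom newer pandas recommends (DataFrame.append was later deprecated and removed), is to collect the per-file frames in a list and concatenate once at the end; a sketch:

def read_folder_all_excel_v2(src_root):
    frames = []
    for rt, dirs, files in os.walk(src_root):
        for single_file in files:
            if single_file.split('.')[-1] == 'csv':
                try:
                    frames.append(pd.read_excel(os.path.join(rt, single_file),
                                                sheet_name=u'特征检测'))
                except Exception as e:
                    print(e)
    # one concat at the end is much faster than appending inside the loop
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()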
Since the data is read file by file and merged into one large DataFrame, saving that DataFrame as a pickle file saves a lot of time the next time it is needed. DataFrame has a very convenient API for this:
df_all.to_pickle(r'df_all.pkl')
import pandas as pd
df_all = pd.read_pickle(r'df_all.pkl')
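To make the caching automatic across runs, the two paths can be wrapped in a small existence check; a minimal sketch, assuming src_root is the data folder used above:

import os
if os.path.exists(r'df_all.pkl'):
    df_all = pd.read_pickle(r'df_all.pkl')    # fast path: reuse the cached frame
else:
    df_all = read_folder_all_excel(src_root)  # slow path: re-read every file
    df_all.to_pickle(r'df_all.pkl')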
After the statistics are computed, they need to be written to a file; a CSV is used here. Note that the encoding should be utf-8-sig: if the file is saved as plain utf-8, it shows up as mojibake when opened in Excel (the BOM is what tells Excel the file is UTF-8).
def get_total_events_times(df_all, file_save_path):
    total_events_times = df_all['事件名称'].value_counts()  # value_counts returns a Series
    total_events_times_df = total_events_times.to_frame()
    total_events_times_df.reset_index(inplace=True)
    total_events_times_df.columns = ['事件名称', '发生次数']
    total_events_times_df.to_csv(file_save_path, index=False, encoding='utf_8_sig')
def get_category_events_times(df_all, file_save_path):
    category_events_times = df_all.groupby('安全类型')['事件名称'].agg('value_counts')
    category_events_times_df = category_events_times.to_frame()
    category_events_times_df.columns = ['发生次数']
    category_events_times_df.to_csv(file_save_path, encoding='utf_8_sig')
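Usage then looks like this (the output filenames here are placeholders):

get_total_events_times(df_all, r'total_events_times.csv')
get_category_events_times(df_all, r'category_events_times.csv')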
jmr_df = df_all[(df_all['安全类型'] == '蠕虫病毒') | (df_all['安全类型'] == '木马后门')]
Filter out the webshell-related events; here ~ means negation, i.e. keep the rows that do not match.
# one case-insensitive regex replaces four chained filters ('ebshell' also covers 'ebShell')
jmr_df = jmr_df[~jmr_df['事件名称'].str.contains('ebshell|php|扫描', case=False)]
If a new column needs to be derived from several existing columns, the following code can serve as a reference:
existed_event_df = pd.read_excel(r'fall_computer.xlsx')
jmr_df_merged = jmr_df.merge(existed_event_df, on = '事件名称')
def get_fall_computer_ip(source_ip, dest_ip, is_source, fall_flag=True):
    if fall_flag:  # asking for the compromised host
        if is_source == 1:
            return source_ip
        else:
            return dest_ip
    else:  # asking for the attacking host
        if is_source == 1:  # if the source is the compromised host
            return dest_ip  # then the attacker is the destination IP
        else:
            return source_ip
jmr_df_merged['失陷主机IP'] = jmr_df_merged.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], True), axis = 1)
jmr_df_merged['攻击主机IP'] = jmr_df_merged.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], False), axis = 1)
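As an aside, the row-wise apply above is slow on large frames; the same two columns can be derived in a vectorized way. A sketch using numpy.where, equivalent to the two apply calls:

import numpy as np
is_src = jmr_df_merged['source'] == 1  # True where the source IP is the compromised host
jmr_df_merged['失陷主机IP'] = np.where(is_src, jmr_df_merged['源IP地址'], jmr_df_merged['目的IP地址'])
jmr_df_merged['攻击主机IP'] = np.where(is_src, jmr_df_merged['目的IP地址'], jmr_df_merged['源IP地址'])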
The GeoLiteCity.dat file must be present in the current directory; it can be downloaded from https://download.csdn.net/download/herosunly/11195594 .
class GeoIP(object):
    def __init__(self, db="GeoLiteCity.dat"):
        # MEMORY_CACHE loads the whole database into memory for fast lookups
        self.gi = pygeoip.GeoIP(db, pygeoip.MEMORY_CACHE)

    def region(self, ip):
        res = {}
        info = self.gi.record_by_addr(ip)
        if not info:
            return res  # unknown address: return an empty dict
        res["country"] = info["country_code"]
        res["city"] = info["city"]
        return res
One bug was that lookups on the returned dictionary did not use get; since region returns an empty dict for unknown addresses, plain indexing would occasionally fail.
One cause of poor performance was constructing a fresh GeoIP object for every lookup, which is slow, so create the instance once up front and reuse it.
geoip = GeoIP()
jmr_df_merged['失陷主机IP国家'] = jmr_df_merged['失陷主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
jmr_df_merged['失陷主机IP城市'] = jmr_df_merged['失陷主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
jmr_df_merged['攻击主机IP国家'] = jmr_df_merged['攻击主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
jmr_df_merged['攻击主机IP城市'] = jmr_df_merged['攻击主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
Code for the attacking hosts' countries:
jmr_attack_country = jmr_df_merged['攻击主机IP国家'].value_counts().to_frame().reset_index()
jmr_attack_country.columns=['country', 'times']
Code for the compromised hosts' cities:
jmr_fall_computer_city = jmr_df_merged[jmr_df_merged['失陷主机IP国家'] == 'CN']['失陷主机IP城市'].value_counts().to_frame().reset_index()
jmr_fall_computer_city.columns = ['city', 'times']
The compromised hosts' addresses correspond to Chinese cities, but the library only yields pinyin, so the pinyin has to be mapped to the Chinese city names.
The city.json file can be downloaded from https://download.csdn.net/download/herosunly/11195483 (the file itself is correct; only its listed filename was mistyped).
import json
with open('city.json', encoding = 'utf-8') as json_data:
city_dict = json.load(json_data)
jmr_fall_computer_city['city_name'] = jmr_fall_computer_city['city'].map(lambda x: city_dict.get(x, 'unknown'))
jmr_fall_computer_city.to_csv('jmr_fall_computer_city.csv', encoding = 'utf-8-sig', index = False)
# then make a few manual corrections based on common sense
The attacking hosts' addresses correspond to countries, but the library only yields the English (ISO) country codes, so the codes have to be mapped to the Chinese country names.
The country.csv file can be downloaded from https://download.csdn.net/download/herosunly/11195628 .
country_df = pd.read_csv(r'country.csv')
jmr_attack_country_df = jmr_attack_country.merge(country_df, on='country')
jmr_attack_country_df.to_csv(r'jmr_attack_computer_country.csv', encoding='utf-8-sig', index = False)
vulnerability_df = df_all[df_all['安全类型'] == '安全漏洞'].copy()  # copy() avoids SettingWithCopyWarning below
vulnerability_df['source'] = 0  # in the previous category this column came from the merge
vulnerability_df['失陷主机IP'] = vulnerability_df.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], True), axis = 1)
vulnerability_df['攻击主机IP'] = vulnerability_df.apply(lambda row: get_fall_computer_ip(row['源IP地址'], row['目的IP地址'], row['source'], False), axis = 1)
geoip = GeoIP()
vulnerability_df['失陷主机IP国家'] = vulnerability_df['失陷主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
vulnerability_df['失陷主机IP城市'] = vulnerability_df['失陷主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
vulnerability_df['攻击主机IP国家'] = vulnerability_df['攻击主机IP'].apply(lambda x: geoip.region(x).get('country', 'empty'))
vulnerability_df['攻击主机IP城市'] = vulnerability_df['攻击主机IP'].apply(lambda x: geoip.region(x).get('city', 'empty'))
vulnerability_attack_country = vulnerability_df['攻击主机IP国家'].value_counts().to_frame().reset_index()
vulnerability_attack_country.columns=['country', 'times']
vulnerability_fall_computer_city = vulnerability_df[vulnerability_df['失陷主机IP国家'] == 'CN']['失陷主机IP城市'].value_counts().to_frame().reset_index()
vulnerability_fall_computer_city.columns = ['city', 'times']
import json
with open('city.json', encoding = 'utf-8') as json_data:
city_dict = json.load(json_data)
vulnerability_fall_computer_city['city_name'] = vulnerability_fall_computer_city['city'].map(lambda x: city_dict.get(x, 'unknown'))
vulnerability_fall_computer_city.to_csv('vulnerability_fall_computer_city.csv', encoding = 'utf-8-sig', index = False)
country_df = pd.read_csv(r'country.csv')
vulnerability_attack_country_df = vulnerability_attack_country.merge(country_df, on='country')
vulnerability_attack_country_df.to_csv(r'vulnerability_attack_computer_country.csv', encoding='utf-8-sig', index = False)
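This vulnerability block repeats the worm/trojan pipeline almost line for line, so the enrich-and-export steps could be factored into one helper. A sketch (the helper name and prefix parameter are mine; it assumes geoip, country_df and city_dict already exist as above):

def enrich_and_export(df, prefix):
    # geo-enrich both IP columns
    for role in ['失陷主机IP', '攻击主机IP']:
        df[role + '国家'] = df[role].apply(lambda x: geoip.region(x).get('country', 'empty'))
        df[role + '城市'] = df[role].apply(lambda x: geoip.region(x).get('city', 'empty'))
    # attacking-host country statistics
    country = df['攻击主机IP国家'].value_counts().to_frame().reset_index()
    country.columns = ['country', 'times']
    country.merge(country_df, on='country').to_csv(
        prefix + '_attack_computer_country.csv', encoding='utf-8-sig', index=False)
    # compromised-host city statistics (mainland China only)
    city = df[df['失陷主机IP国家'] == 'CN']['失陷主机IP城市'].value_counts().to_frame().reset_index()
    city.columns = ['city', 'times']
    city['city_name'] = city['city'].map(lambda x: city_dict.get(x, 'unknown'))
    city.to_csv(prefix + '_fall_computer_city.csv', encoding='utf-8-sig', index=False)

With that, the two sections reduce to enrich_and_export(jmr_df_merged, 'jmr') and enrich_and_export(vulnerability_df, 'vulnerability').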
Postscript: besides walking the tree with os.walk, traversal can also be done with the glob function. It accepts wildcard patterns; a plain pattern only matches files directly inside one folder, though since Python 3.5 the ** pattern together with recursive=True descends into subfolders as well. For example:
import glob
loop = glob.glob(r'c:\*.*')
for i in loop:
    print(i)
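For completeness, the recursive variant (Python 3.5+):

import glob
# ** together with recursive=True also descends into subfolders
for path in glob.glob(r'c:\**\*.*', recursive=True):
    print(path)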