# 注意：代码中的 LONGITUDE、LATITUDE、SPEED、DIRECT 等列名是博主做交通数据处理时残留的模板；如需自定义使用，将它们替换为你的使用场景下对应的列名即可。
import pandas as pd
import numpy as np
import matplotlib.pyplot as pit
import csv
import codecs
import openpyxl
def formulate(file_name):
    """Prepend the fixed 16-name header row to a headerless CSV, in place.

    The file is read without a header, a row holding the column names is put
    in front of the data, and the result is written back over ``file_name``.
    The header is prepended unconditionally: calling this twice on the same
    file leaves two header rows.

    Args:
        file_name: Path of the CSV file to rewrite.

    Returns:
        The rewritten file re-read as a DataFrame, with the first column
        used as the index.
    """
    column_names = [
        "ID_0", "ID_1", "ID_2", "EMPTY", "LONGITUDE", "LATITUDE", "SPEED",
        "DIRECT", "STATUS", "CPS_Y", "CPS_MON", "CPS_D", "CPS_H", "CPS_MIN",
        "CPS_S", "CARTYPE",
    ]
    raw = pd.read_csv(file_name, header=None)
    header_row = pd.DataFrame([column_names])
    # DataFrame.append no longer exists in pandas 2.x; concat stacks the
    # header row on top of the data the same way.
    with_header = pd.concat([header_row, raw], ignore_index=True)
    with_header.to_csv(file_name, header=False, index=False)
    return pd.read_csv(file_name, index_col=0)
def data_write_csv(file_name, datas):
    """Write rows to a space-delimited, UTF-8 encoded CSV file.

    Args:
        file_name: Path of the file to write; an existing file is overwritten.
        datas: Iterable of rows, each row an iterable of field values.
    """
    # newline='' leaves line endings to the csv writer.
    with open(file_name, 'w', encoding='utf-8', newline='') as file_csv:
        # The quote character has to differ from the delimiter, otherwise a
        # quoted field cannot be told apart from a field separator.
        writer = csv.writer(
            file_csv, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        # One output line per row.
        writer.writerows(datas)
    print("保存文件成功,处理结束")
def text_save(filename, data):
    """Append each element of ``data`` to a text file, one element per line.

    Every element is converted with ``str()`` and the characters ``[``,
    ``]``, ``'`` and ``,`` are removed before the line is written.

    Args:
        filename: Path of the file to append to; created if it does not exist.
        data: Iterable of elements to write (list, pandas Series, ...).
    """
    strip_table = str.maketrans('', '', "[]',")
    # Iterate the elements directly: positional data[i] lookups are
    # label-based on a pandas Series and fail for non-0..n-1 indexes.
    with open(filename, 'a', encoding='utf-8') as file:
        file.writelines(str(item).translate(strip_table) + '\n' for item in data)
    print("保存文件成功")
# Configure how pandas prints frames: no cap on the number of columns or
# rows shown, and a column-width setting of 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('max_colwidth', 100),
):
    pd.set_option(option_name, option_value)
filename = '输入需要预处理的数据所在的文件地址'
# Every later step addresses columns by name, so the first row of the file
# is read as the header. A headerless file can be given one with formulate().
df = pd.read_csv(filename)
# Strip stray whitespace from the column names before the first name-based
# lookup; COL keeps the names exactly as they were read.
COL = df.columns.values
df.columns = [x.strip() for x in COL]
print("数据特征:(数据总数【行】, 属性个数【列】)")
print(df.shape)
print("数据结构:")
# info() prints its report itself and returns None, so it is not wrapped
# in print().
df.info()
print("数据详细描述:")
summary = df.describe()
print(summary)
summary.to_csv('./output.csv')
# Export the coordinate columns on their own.
map_point = df[['LONGITUDE', 'LATITUDE']]
map_point.to_csv('./output_points2.csv')
print(COL)
# Flag rows that repeat an earlier row, report how many there are, then
# remove them and renumber the remaining rows from 0.
dup_mask = df.duplicated()
print(dup_mask)
print("重复值数据量统计:")
print(dup_mask.sum())
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
# Standardise DIRECT: each value's distance from the column mean, measured
# in standard deviations.
sta = (df['DIRECT'] - df['DIRECT'].mean()) / df['DIRECT'].std()
print(sta)
text_save('./output.txt', sta)
# Rows whose standardised value exceeds 3 in magnitude are treated as
# outliers; the mask is computed once and reused.
outlier_mask = sta.abs() > 3
print(df[outlier_mask])
errorIndex = df[outlier_mask].index
print(errorIndex)
print(sta)
delindex = errorIndex
print("异常值index定位:")
print(errorIndex)
# Drop the outlier rows once, in place. Dropping the same labels a second
# time would raise KeyError because they no longer exist in the frame.
df.drop(errorIndex, inplace=True)
print("缺失值数据量:")
print(df.isnull().sum())
# SPEED1, AREA and AREADETAIL are not among the column names formulate()
# assigns, so the two fills below run only when the data has those columns.
if 'SPEED1' in df.columns:
    speed_missing = df['SPEED'].isnull()
    # Take the replacement from the same rows that are being filled (so both
    # sides line up), keep its first 6 characters, and convert to a number
    # because SPEED is averaged below. Unparseable text becomes NaN.
    speed_fallback = df.loc[speed_missing, 'SPEED1'].astype(str).str[:6]
    df.loc[speed_missing, 'SPEED'] = pd.to_numeric(speed_fallback, errors='coerce')
if 'AREA' in df.columns and 'AREADETAIL' in df.columns:
    area_missing = df['AREA'].isnull()
    # NOTE(review): this writes one character (position 5 of the printed
    # AREADETAIL array) into every missing AREA cell. Kept as it was;
    # confirm the intended rule before relying on it.
    df.loc[area_missing, 'AREA'] = str(df.loc[area_missing, 'AREADETAIL'].values)[5:6]
# Fill what is still missing with the column mean rounded to a whole number.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which may act on a temporary copy.
df['SPEED'] = df['SPEED'].fillna(round(df['SPEED'].mean(), 0))
print("[xxx, 0]时结果如下:")
print(round(df['SPEED'].mean(), 0))
print("[xxx, 1]时结果如下:")
print(round(df['SPEED'].mean(), 1))