原创文章:如有下载及转载请注明来源链接,否则视为侵权
(第一)------------------数据读取相关方法--------------
# Core libraries for data loading, analysis and modeling.
import numpy as np  # fixed: was "import numpy as py" — the rest of the notes use np.*
import pandas as pd
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # fixed: class lives in sklearn.ensemble, CamelCase name

# NOTE(review): path is empty — fill in the CSV location before running.
loandData = pd.read_csv("")
Pandas将多个Sheet写入到本地同一Excel文件中
import pandas as pd #读取两个表格data1=pd.read_excel('文件路径')
data2=pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系统入账金额异常结果数据.xlsx')#读取第二个表格(路径需加r前缀成原始字符串,否则\U会被当作Unicode转义导致语法错误)
data1=pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系统入账金额.xlsx')#读取第一个表格
writer=pd.ExcelWriter('D:新表.xlsx')
data1.to_excel(writer,sheet_name='sheet1')
data2.to_excel(writer,sheet_name='sheet2') #必须运行
writer.save()#不然不能输出到本地writer.save()
------Pandas的read_csv读入数据并且自己给列名命名--------
3. 使用Pandas的read_csv、read_fwf、read_table读取数据
import pandas as pd#数据框dataframe
csv_data=pd.read_csv('D:\Python数据分析与数据化运营\python_book\chapter2\csv_data.csv',names=['aa','bb','cc','dd','hh'])#读入数据并且自己给列名命名
print(csv_data)
--------Pandas的pd.read_fwf读入数据并且自己给列名命名----------
import pandas as pd # 导入Pandas库数据框dataframe
fwf_data = pd.read_fwf('D:\Python数据分析与数据化运营\python_book\chapter2\fwf_data', widths=[5, 5, 5, 5], names=['col1', 'col2', 'col3', 'col4']) # 读取csv数据
print (fwf_data) # 打印输出数据
----------Pandas的pd.read_table(txt)读入数据并且自己给列名命名---------
import pandas as pd#数据框dataframe
table_data=pd.read_table('D:\Python数据分析与数据化运营\python_book\chapter2\table_data.txt',sep=';',
names=['aa','ab','ac','ad','ah'])
print(table_data)
------------numpy的读入(txt)---------------------
import numpy as np#array数据组形式
file_name='D:\Python数据分析与数据化运营\python_book\chapter2\numpy_data.txt'
data=np.loadtxt(file_name,dtype='float32',delimiter=' ')
print(data)
------------numpy的读入(npy)----------------
import numpy as np#读入npy格式
write_data=np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12]])
np.save('D:\Python数据分析与数据化运营\python_book\chapter2\load_data1',write_data)
read_data=np.load('D:\Python数据分析与数据化运营\python_book\chapter2\load_data1.npy')
1. 使用read、readline、readlines读取数据
通过read方式读入text.txt数据
file_name='D:\Python数据分析与数据化运营\python_book\chapter2\text.txt'
file_object=open(file_name)
read_data=file_object.read()
print(read_data)
通过readline方式读入text.txt数据
file_object=open(file_name)
readline_data=file_object.readline()
print(readline_data)
fn = open('D:\Python数据分析与数据化运营\python_book\chapter2\text.txt') # 获得文件对象
print (fn.tell()) # 输出指针位置
line1 = fn.readline() # 获得文件第一行数据
print (line1) # 输出第一行数据
print (fn.tell()) # 输出指针位置
line2 = fn.readline() # 获得文件第二行数据
print (line2) # 输出第二行数据
print (fn.tell()) # 输出指针位置
fn.close() # 关闭文件对象
(第二)------------数据预览相关方法---------------
loandData.head()#查看前5行记录
loandData.info()#查看每个变量结构基本情况
loandData.shape#查看总样本数及总字段数量(注意:shape是属性而不是方法,不能加括号调用)
loandData.count()#查看每个变量对应的样本数
loandData.describe()#查看变量最基本指标情况(包括最大值,最小值,方差,第一分位数,中位数数,第三分位数,均值,计数)
df.head(5) #查看DataFrame对象的前n行
df.tail(5) #查看DataFrame对象的最后n行
df.shape #查看数据的行列数各是多少
df.info() #查看索引、数据类型和内存信息
df.describe(percentiles=[.05, .25, .75, .95]) #查看数值型列的汇总统计,返回计数、均值、标准差、最小最大值、25%50%75%分位数,percentiles0.05,0.95分位数
df['列名'].unique() #快速查看某一列有哪些分类内容(unique是Series的方法,DataFrame本身没有unique)
s.value_counts(dropna=False) #查看Series对象的唯一值和计数
df.apply(pd.Series.value_counts) #查看DataFrame对象中每一列的唯一值和计数
df.sum() #返回所有列的求和值
df.mean() #返回所有列的均值
df.corr() #返回列与列之间的相关系数
df.count() #返回每一列中的非空值的个数
df.max() #返回每一列的最大值
df.min() #返回每一列的最小值
df.median() #返回每一列的中位数
df.std() #返回每一列的标准差
df.loc[:3,['incall']]#基于incall列,选择特定行的数据,返回DataFrame格式
df[['incall']]#选择基于incall列,返回DataFrame格式
print(df['incall'])
import pandas as pd
data=pd.read_csv(r'/Users/huangjunwen/Desktop/fandango_score_comparison.csv')
这是我的路径大家使用的时候记得改,记得改,记得改!
print(data.head(10))#查看前10行数据,数据大的时候可以通过该方式快速看到df的数据格式
loanData=loanData[loanData['Status']!='Cancelled']#间接删除某个字段中某一类数据
--增加一列(CreditScore)变量=((最高信用+最低信用)/2).round(0)
loanData['CreditScore']=((loanData.CreditScoreRangeUpper+loanData.CreditScoreRangeLower)/2).round(0)#.round(0)表示取四舍五入
1:----------------------批量数据转换-------------------------------------
def loan_status(s):
    """Collapse a raw LoanStatus value into one of four buckets.

    'Chargedoff' and 'Defaulted' both represent an investor loss, so
    they map to the same 'Defaulted' bucket; 'Cancelled' and 'Current'
    pass through unchanged; every other status (completed, final
    payment in progress, past due, ...) counts as 'Completed', i.e.
    no investor loss.
    """
    bucket_by_status = {
        'Chargedoff': 'Defaulted',  # charged off -> investor loss
        'Defaulted': 'Defaulted',   # bad debt -> investor loss
        'Cancelled': 'Cancelled',   # loan was cancelled
        'Current': 'Current',       # still being repaid
    }
    # Anything not explicitly listed falls back to 'Completed'.
    return bucket_by_status.get(s, 'Completed')
loanData['Status']=loanData['LoanStatus'].apply(loan_status)#将数据进行转换,增加一列为Status变量:LoanStatus数据字段转换为Status(来自loan_status函数转换了贷款状态)
------------------------------过滤筛选-----------------------------
DefaultedRatio=loanData[loanData['Status']=='Defaulted']['DebtToIncomeRatio']
CompletedRatio=loanData[loanData['Status']=='Completed']['DebtToIncomeRatio']
2:---------------------批量数据转换--先求四分位数及中位数,再转换-----------------
loanData['BankcardUtilization']的四位数是0.31
oneFourth=loanData['BankcardUtilization'].quantile(0.25)
loanData['BankcardUtilization']的中位数是0.6
twoForth=loanData['BankcardUtilization'].quantile(0.5)
def bank_card_use(s, oneForth=0.31, twoForth=0.6):
    """Bucket a BankcardUtilization ratio into a usage band.

    The default thresholds (0.31 / 0.6) are the 25th and 50th
    percentiles of the data, chosen per business judgement.

    Bands: (-inf, oneForth] -> 'Mild Use', (oneForth, twoForth] ->
    'Medium Use', (twoForth, 1] -> 'Heavy Use', (1, inf) -> 'Super Use';
    a value that matches none of these comparisons (e.g. NaN) falls
    through to 'No Use'.
    """
    if s <= oneForth:
        return 'Mild Use'
    if s <= twoForth:
        return 'Medium Use'
    if s <= 1:
        return 'Heavy Use'
    if s > 1:
        return 'Super Use'
    # Reached only when every comparison is False (NaN input).
    return 'No Use'
loanData['BankCardUse']=loanData['BankcardUtilization'].apply(bank_card_use)
3:---------------------------缺失值处理相关---------------------------
填充缺失值
x['age'].fillna(x['age'].mean(), inplace=True)
2.通常情况下删除行,使用参数axis = 0,删除列的参数axis = 1,通常不会这么做,那样会删除一个变量。
print('\ndrop row')
print(df.dropna(axis = 0))
删除后结果:
缺失值处理
import pandas as pd # 导入pandas库
import numpy as np # 导入numpy库
from sklearn.preprocessing import Imputer # 导入sklearn.preprocessing中的Imputer库
生成缺失数据
df = pd.DataFrame(np.random.randn(6, 4), columns=['col1', 'col2', 'col3', 'col4']) # 生成一份数据
df.iloc[1:2, 1] = np.nan # 增加缺失值
df.iloc[4, 3] = np.nan # 增加缺失值
print (df)
查看哪些值缺失
nan_all = df.isnull() # 获得所有数据框中的N值
print (nan_all) # 打印输出
查看哪些列缺失
nan_col1 = df.isnull().any() # 获得含有NA的列
nan_col2 = df.isnull().all() # 获得全部为NA的列
print (nan_col1) # 打印输出
print (nan_col2) # 打印输出
丢弃缺失值
df2 = df.dropna() # 直接丢弃含有NA的行记录
print (df2) # 打印输出
-------每个字段是否缺失并且展示对应的行数----
missing=pd.concat([loanData.isnull().any(),loanData.count()],axis=1)
--------用中位数------------补全缺失值
loanData['CreditScore']=loanData['CreditScore'].fillna(loanData['CreditScore'].median())
使用sklearn将缺失值替换为特定值
nan_model = Imputer(missing_values='NaN', strategy='mean', axis=0) # 建立替换规则:将值为Nan的缺失值以均值做替换
nan_result = nan_model.fit_transform(df) # 应用模型规则
print (nan_result) # 打印输出
使用pandas将缺失值替换为特定值
nan_result_pd1 = df.fillna(method='backfill') # 用后面的值替换缺失值
nan_result_pd2 = df.fillna(method='bfill', limit=1) # 用后面的值替代缺失值,限制每列只能替代一个缺失值
nan_result_pd3 = df.fillna(method='pad') # 用前面的值替换缺失值
nan_result_pd4 = df.fillna(0) # 用0替换缺失值
nan_result_pd5 = df.fillna({'col2': 1.1, 'col4': 1.2}) # 用指定值替换不同列的缺失值
nan_result_pd6 = df.fillna(df.mean()['col2':'col4']) # 用平均数代替,选择各自列的均值替换缺失值
打印输出
print (nan_result_pd1) # 打印输出
print (nan_result_pd2) # 打印输出
print (nan_result_pd3) # 打印输出
print (nan_result_pd4) # 打印输出
print (nan_result_pd5) # 打印输出
print (nan_result_pd6) # 打印输出
主要变量的缺失值,只显示存在缺失的变量,获取其缺失数量,以及缺失率,代码实现如下: 4.1:######用中位数替换CreditScore的缺失值############ 用“NOTA” 替换BorrowerState的缺失值 4.2:############DebtToIncomeRatio 添加随机数缺失值处理######### def rand_missing(s): loanData['DebtToIncomeRatio']=loanData['DebtToIncomeRatio'].apply(rand_missing) 4.3:##############将DelinquenciesLast7Years的缺失值赋值为1################ 4.4###########2009之后,选出ProsperRating (Alpha)为空的行,然后对行进行删除##### 4.5#字符串变量转换成数字变量 def harmonize_data(df): pandas以类似字典的方式来获取某一列的值,比如df[‘A’],这会得到df的A列。如果我们对某一行感兴趣呢?这个时候有两种方法,一种是iloc方法,另一种方法是loc方法。loc是指location的意思,iloc中的i是指integer。这两者的区别如下: 4.6#######################通过循环计算准确率 result=rfr.predict(X_test) pecent=accuracy_statistics(list(Y_test.values),list(result)) 该模型预测结果的准确率为(1498+4520)/8385=71.77%。 5##########如何安装fbprophet库 如何安装fbprophet库 python——修改Dataframe列名的两种方法 df.columns = ['A','B'] 2、只修改列名a为A Python中dataframe\ array\ list相互转化 2、array转化成dataframe 3、把Pandas中的dataframe转成numpy中的array 使用 df=df.values, print(len(X))--查询数组个数 python 切片无循序取其中某列 ---python left join ------------------------------安装分词库jieba------------------------------------------------------------------------------------------------- stopwords = stopwordslist('D:\2019年python代码\python_book\chapter4\stopwords.txt') # 这里加载停用词的路径 ---------------------------------------在python中如何将两个list合并成一个list---------------------------------------------------------------- --------------------------------pd.set_option('display.max_rows', 4)#限定只显示四行-------------------------------------------------------------------------- -------------------------------------------删除NA缺失数据--------------------------------------------------------------------------------------------------- 查看数据内容: ----------------------------------------把data数据中的y列的'yes'值改为1,'no'值改为0---------------------------------------------------------------------------- data.loc[data['y']=='yes','y']=1#把data数据中的y列的'yes'值改为1 
-----------------------------------------------------取两表差集合(相当去重)------------------------------------------------------------------- --方法1取两表差集合(相当去重) ddd=df1.merge(df2,how='left',left_on='id',right_on='id')#用mergy方法找出两表 --方法2取两表差集合(相当去重) df_mean=df[['col1','col2']].mean(axis=0)#数据框dataframe求均值 df_mean=df[['col1','col2']].mean(axis=1)#数据框dataframe求均值 df_std=df[['col1','col2']].std(axis=0)#数据框dataframe求均值 df_std=df[['col1','col2']].std(axis=1)#数据框dataframe求均值 df_var=df[['col1','col2']].var(axis=0)#数据框dataframe求均值 df_var=df[['col1','col2']].var(axis=1)#数据框dataframe求均值 df_max=df[['col1','col2']].max(axis=0)#数据框dataframe求均值 df_max=df[['col1','col2']].max(axis=1)#数据框dataframe求均值 df_min=df[['col1','col2']].min(axis=0)#数据框dataframe求均值 df_min=df[['col1','col2']].min(axis=1)#数据框dataframe求均值 df_sum=df[['col1','col2']].sum(axis=0)#数据框dataframe求均值 df_sum=df[['col1','col2']].sum(axis=1)#数据框dataframe求均值 df_count=df[['col1','col2']].count(axis=0)#数据框dataframe求均值 df_count=df[['col1','col2']].count(axis=1)#数据框dataframe求均值 df_median=df[['col1','col2']].median(axis=1)#数据框dataframe求均值 import pandas as pd # 导入pandas库 data1 = ['a', 3] isDuplicated = df.duplicated() # 判断重复数据记录 new_df1 = df.drop_duplicates() # 删除数据记录中所有列值相同的记录 print ('It 's a dog!') print (r'\\t\') import datetime students = ["ming", "hua", "li", "juan", "yun", 3] print ('[4]=', students[4]) students[3] = "小月" students[5]="小楠" students[5]=19978 students.append('han') # 只能添加一个添加到尾部 scores = [90, 80, 75, 66] print (students.pop(1)) # 该函数返回被弹出的元素,不传入参数则删除最后一个元素 print( 'wan' in students) print (range(10)) studentsTuple = ("ming", "jun", "qiang", "wu", scores) try: students = ["ming", "hua", "li", "juan", "yun", 3] x = a & b y = a | b z = a - b new = set(a) k = {"name":"weiwei", "home":"guilin"} k["like"] = "music" print ('name' in k) if 'name' in k: k.get('edu', -1) # 通过dict提供的get方法,如果key不存在,可以返回None,或者自己指定的value k.pop('like') salary = 30000 heights = {'Yao':226, 'Sharq':216, 'AI':183} for key, value in heights.items(): total 
= 0 -------------------------------------------------------while 循环简单操作------------------------------------------------------------- a = 1 ---------------------------------------------- break、continue和pass----------------------------------------------------------------- for i in range(1, 5): for i in range(1, 5): myList = [-1, 2, -3, 4, -5, 6, 7] sample=pd.DataFrame({'name':['Bob','Lindy','Mark','Miki','Sully','Rose'], sample sample.sort_values('score',ascending=False,na_position='last') sample.sort_values('score',ascending=True,na_position='last') sample.sort_values(['group','score']) sample.sort_values(['group','score']) -------------------------------------------------------数据框分组汇总-------------------------------------- sample = pd.read_csv(r'D:\《Python数据科学:技术详解与商业实践》源代码文件\Python_book\5Preprocessing\sample.csv', encoding='gbk') sample.groupby('class')[['math']].min() sample.groupby(['grade','class'])[['math']].mean()#两个字段分组 sample.groupby(['grade'])['math','chinese'].mean() sample.groupby('class')['math'].agg(['mean','min','max']) df = sample.groupby(['grade','class'])['math','chinese'].agg(['min','max']) table = pd.DataFrame({'cust_id':[10001,10001,10002,10002,10003], pd.pivot_table(table,index='cust_id',columns='type',values='Monetary') pd.pivot_table(table,index='cust_id',columns='type',values='Monetary', table1 = pd.pivot_table(table,index='cust_id', pd.melt(table1, df1 = pd.DataFrame({'id':[1,2,3], df1.merge(df2,how='inner',left_on='id',right_on='id') df1.merge(df2,how='left',left_on='id',right_on='id') df1.merge(df2,how='right',on='id') df1 = pd.DataFrame({'id1':[1,2,3], pd.concat([df1,df2],axis=1) df1 = pd.DataFrame({'id':[1,1,1,2,3,4,6], pd.concat([df1,df2],ignore_index=True,axis=0) pd.concat([df1,df2],ignore_index=True).drop_duplicates()#纵向合并并且去重 pd.concat([df1,df3],ignore_index=True).drop_duplicates() ----------------------------------------------------------------数据整合-------------------------------------- import pandas as pd sample.ix[0:2, 0:2] 
sample['new_col1'] = sample['a'] - sample['b'] sample_new=sample.assign(new_col2 = sample['a'] - sample['b'], sample.drop('a',axis=1)#数据框删除某一列 sample =pd.DataFrame({'name':['Bob','Lindy','Mark', sample[sample.score > 70] sample[(sample.score > 70) & (sample.group ==1)] sample.query('(group ==2) |(group == 1)') sample[sample['score'].between(70,80,inclusive=True)] sample[sample['name'].isin(['Bob','Lindy'])] sample = pd.DataFrame({'name':['Bob','Lindy','Mark', sample.score.replace(999,np.nan) sample.replace({'score':{999:np.nan}, def transform(row): sample.assign(class_n = sample.apply(transform,axis=1)) sample = sample.copy()
missing=pd.concat([loanData.isnull().any(),loanData.count()],axis=1) #查出每个变量是否缺失?每个变量总的观测值
column=['是否缺失','数量']#定义两列
missing1=pd.DataFrame(list(missing.values),index=list(missing.index),columns=column)#修正新列命 ‘是否缺失','数量'
max=missing1['数量'].max()#每个变量最大观测值
missing1['缺失数量']=max-missing1['数量']#每个变量最大观测值-每个变量实际观测值=每个变量缺失数量
missing1['缺失率']=missing1['缺失数量']/max#求出缺失值
miss=missing1[missing1['数量']
loanData['CreditScore']=loanData['CreditScore'].fillna(loanData['CreditScore'].median())
loanData['BorrowerState']=loanData['BorrowerState'].fillna('NOTA')DebtToIncomeRatio缺失值添加随机数
if s>=0:
a=s
else:
a=random.uniform(0.1,0.5)
return aDebtToIncomeRatio的缺失值添加0.1~0.5的随机变量
loanData['DelinquenciesLast7Years'] = loanData['DelinquenciesLast7Years'].fillna(1)
missIndex=loanData[(loanData['ProsperRating (Alpha)'].isnull()) & (loanData['DatePhase']=='After Jul.2009')]
loanData=loanData.drop(missIndex.index,axis=0)数据中存在字符串变量,将其用数字变量进行替换。实现的函数如下:
定性变量的赋值
# 填充空数据 和 把string数据转成integer表示
#Status
df.loc[df['Status']=='Completed','Status']=1
df.loc[df['Status'] == 'Defaulted', 'Status'] = 0
df.loc[df['Status'] == 'Current', 'Status'] = 2
#IsBorrowerHomeowner
df.loc[df['IsBorrowerHomeowner'] == False, 'IsBorrowerHomeowner'] = 0
df.loc[df['IsBorrowerHomeowner'] == True, 'IsBorrowerHomeowner'] = 1
#CreditGrade
df.loc[df['CreditGrade'] == 'NC', 'CreditGrade'] = 0
df.loc[df['CreditGrade'] == 'HR', 'CreditGrade'] = 1
df.loc[df['CreditGrade'] == 'E', 'CreditGrade'] = 2
df.loc[df['CreditGrade'] == 'D', 'CreditGrade'] = 3
df.loc[df['CreditGrade'] == 'C', 'CreditGrade'] = 4
df.loc[df['CreditGrade'] == 'B', 'CreditGrade'] = 5
df.loc[df['CreditGrade'] == 'A', 'CreditGrade'] = 6
df.loc[df['CreditGrade'] == 'AA', 'CreditGrade'] = 7
#ProsperRating (Alpha)
df.loc[df['ProsperRating (Alpha)'] == 'HR', 'ProsperRating (Alpha)'] = 1
df.loc[df['ProsperRating (Alpha)'] == 'E', 'ProsperRating (Alpha)'] = 2
df.loc[df['ProsperRating (Alpha)'] == 'D', 'ProsperRating (Alpha)'] = 3
df.loc[df['ProsperRating (Alpha)'] == 'C', 'ProsperRating (Alpha)'] = 4
df.loc[df['ProsperRating (Alpha)'] == 'B', 'ProsperRating (Alpha)'] = 5
df.loc[df['ProsperRating (Alpha)'] == 'A', 'ProsperRating (Alpha)'] = 6
df.loc[df['ProsperRating (Alpha)'] == 'AA', 'ProsperRating (Alpha)'] = 7
#IncomeRange
df.loc[df['IncomeRange'] == 'Not displayed', 'IncomeRange'] = 0
df.loc[df['IncomeRange'] == 'Not employed', 'IncomeRange'] = 1
df.loc[df['IncomeRange'] == '1-24,999', 'IncomeRange'] = 3
df.loc[df['IncomeRange'] == '50,000-74,999', 'IncomeRange'] = 5
df.loc[df['IncomeRange'] == '100,000+', 'IncomeRange'] = 7
#BankCardUse
df.loc[df['BankCardUse'] == 'No Use', 'BankCardUse'] = 0
df.loc[df['BankCardUse'] == 'Mild Use', 'BankCardUse'] = 1
df.loc[df['BankCardUse'] == 'Medium Use', 'BankCardUse'] = 2
df.loc[df['BankCardUse'] == 'Heavy Use', 'BankCardUse'] = 3
df.loc[df['BankCardUse'] == 'Super Use', 'BankCardUse'] = 4
#CustomerClarify
df.loc[df['CustomerClarify'] == 'New Borrower', 'CustomerClarify'] = 0
df.loc[df['CustomerClarify'] == 'Previous Borrower', 'CustomerClarify'] = 1
return df#字符串替换成整数
loanData=harmonize_data(loanData)
loc和iloc的区别
loc:works on labels in the index.
iloc:works on the positions in the index (so it only takes integers).
也就是说loc是根据index来索引,比如下边的df定义了一个index,那么loc就根据这个index来索引对应的行。iloc并不是根据index来索引,而是根据行号来索引,行号从0开始,逐次加1。##
Y_test
对预测的准确率进行计算:
def accuracy_statistics(rd, prd):
    """Return the fraction of positions where prediction equals truth.

    rd  : sequence of ground-truth labels
    prd : sequence of predicted labels, same length as rd

    The result is rounded to 4 decimal places; an empty input returns 0.

    Bug fix: the original iterated ``range(1, count)``, silently
    skipping index 0 while still dividing by the full ``count``, which
    under-reported the accuracy. It also shadowed the builtin ``sum``.
    """
    count = len(prd)
    if count == 0:
        return 0  # guard: avoid ZeroDivisionError on empty input
    matches = 0  # renamed from `sum` to avoid shadowing the builtin
    for i in range(count):  # was range(1, count): dropped element 0
        if rd[i] == prd[i]:
            matches += 1
    return round(matches / count, 4)
https://www.jianshu.com/p/0c06ad7bccaa
1:查看安装了那些包 pip list
2: 查看那些包需要更新版本pip list --outdated
3:创建一个新的python环境:conda create -n fbprophet_python3.7 python=3.7
4:激活已创建新的python环境:conda activate fbprophet_python3.7
1、创建一个新的环境按照官网的做法,创建一个新的python环境
conda create -n fbprophet_python3.7 python=3.7
2.激活已创建新的python环境:conda activate fbprophet_python3.7
3.安装C++的编译工具
4.conda install libpython m2w64-toolchain -c msys2
5、安装依赖库
conda install numpy cython -c conda-forge
conda install matplotlib scipy pandas -c conda-forge
6.安装pystan
conda install pystan -c conda-forge
7.安装fbprophet
conda install fbprophet -c conda-forge
8.还是有点问题的。
这样吧在GitHub上,下载一个压缩包,用压缩包安装
https://github.com/facebook/prophet
conda activate fbprophet_python3.7
我又做了一次挣扎,fbprophet也用conda安装
conda install -c conda-forge fbprophet(第六)##############
首先新建一个Dataframe
import pandas as pd
df = pd.DataFrame({'a':[1,2,3],'b':[1,2,3]})
如下:
a b
0 1 1
1 2 2
2 3 3
1、修改列名a,b为A、B。df.rename(columns={'a':'A'})
1、list 转化成array矩阵 np.array(result).T
pd.DataFrame(result)
type(X)--查询数据类型
rr111=rr11[['kelian_x','qingjiatianshu_x','meancall_x','meanlong_x','meanxiapi_x','聚类类别']]
--------python 切片有循序取其中某列
rr111=rr11.iloc[:,0:6]
rr11=pd.merge(left=rr1, right=r0, how='left', left_on=rr1.index, right_on=r0.index)# 关联取出类别数目
1:打开Anaconda Prompt
2: pip install jieba
----------------------------------------本地读入list格式数据-------------------------------------------------------------------------------------
import jieba
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    filepath : path to the stop-word file
    Returns a list of whitespace-stripped lines.

    Fixes vs the original: stray prose was fused onto the return
    statement (a syntax error), and the file handle was never closed —
    a ``with`` block now guarantees it is released.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
1:用list的extend方法,L1.extend(L2),该方法将参数L2的全部元素添加到L1的尾部,例如:
2:用切片(slice)操作,L1[len(L1):len(L1)] = L2和上面的方法等价,例如:
但切片方法用起来更灵活,可以插入到头部,或其他任意部位,例如:
加到开头:
3:加到中间:
-----------------------------------------python数组读入数据-----------------------------------------------------------------------------------
data=np.loadtxt('.txt')
pd.set_option('display.max_rows', 4)#限定只显示四行
data=data.dropna()#删除带有NA的所有行与列
print(data.shape)#查看总样本数及总字段数
print(list(data.columns))#查看所有列
----------------------------------------------查前面五行数据或者最后五行数据数据------------------------------------------------------------------------------
data1.head(5)#前面五行数据
data1.tail(5)#最后五行数据
--------------------------------------------------------去重unique---------------------------------------------------------------------------------------------
data['education'].unique()#去重把y变为数值型,并进行简单的统计。
data.loc[data['y']=='no','y']=0#把data数据中的y列的'no'值改为0
data['y'].value_counts()#分类对应的个数
----------------------------------------------------画直方图------------------------------------------------------------------------------------------------------
sns.countplot(x='y',data=data,palette='hls')#画直方图
--------------------------------------------------------groupby----------------------------------------------------------------------------------------------------
data.groupby('y').mean()
df1 = pd.DataFrame({'id':[1,2,3],
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})#### 1. 数据框内连接,类似inner join
ddd1=ddd[ddd.isnull().T.any()][['id','col1']]#取两表差集(相当去重)
d1=df1['id'].values.tolist()
d2=df2['id'].values.tolist()
data=[]
for i in d1:
if i in d2:
data=data
else:
data.append(i)
df3=df1[df1['id'].isin(data)]
----------------------------------------------dataframe求均值-------------------------------------------------------------dataframe按列求均值
print(df_mean)dataframe按行求均值
print(df_mean)
-------------------------------------------------dataframe求svd------------------------------------------------------------dataframe按列求std
print(df_std)dataframe按行求std
print(df_std)
---------------------------------------------#dataframe求var(标准差)-------------------------------------------------------dataframe按列求var(标准差)
print(df_var)dataframe按行求var(标准差)
print(df_var)
--------------------------------------------#dataframe按列求max-----------------------------------------------------------dataframe按列求max
print(df_max)dataframe按行求max
print(df_max)
----------------------------------------------#dataframe求min-------------------------------------------------------------dataframe按列求min
print(df_min)dataframe按行求min
print(df_min)
-------------------------------------------------#dataframe求sum---------------------------------------------------------dataframe按列求sum
print(df_sum)dataframe按行求sum
print(df_sum)
---------------------------------------------------#dataframe求count------------------------------------------------------------dataframe按列求count
print(df_count)dataframe按行求count
print(df_count)
------------------------------------------------#dataframe求中位数--------------------------------------------------------------
df_median=df[['col1','col2']].median(axis=0)#数据框dataframe求均值
print(df_median)dataframe按行求median
print(df_median)
-------------------------------------------------dataframe按describe求常规统计量,不能用axis=1或者axis=2------------------------------
df_describe=df[['col1','col2']].describe()#数据框dataframe求均值
print(df_describe)
------------------------------------------------#dataframe按corr()求相关系数,不能用axis=1或者axis=2-----------------------------------
df_corr=df.corr()#数据框dataframe求均值
print(df_corr)
----------------------------------------------------dataframe判断重复数据------------------------------------------------------------重复值处理
生成重复数据
data2 = ['b', 2]
data3 = ['a', 3]
data4 = ['c', 2]
df = pd.DataFrame([data1, data2, data3, data4], columns=['col1', 'col2'])
print (df)判断重复数据
print (isDuplicated) # 打印输出
-------------------------------------------------------dataframe删除重复值------------------------------------------------------------删除重复值
new_df2 = df.drop_duplicates(['col1']) # 删除数据记录中col1值相同的记录
new_df3 = df.drop_duplicates(['col2']) # 删除数据记录中col2值相同的记录
new_df4 = df.drop_duplicates(['col1', 'col2']) # 删除数据记录中指定列(col1/col2)值相同的记录
print (new_df1) # 打印输出
print (new_df2) # 打印输出
print (new_df3) # 打印输出
print (new_df4) # 打印输出
--------------------------------------------------------转义字符--------------------------------------------------------------------- 转义字符''
转义字符\可以转义很多字符,比如\n表示换行,\t表示制表符,字符\本身也要转义,所以\\表示的字符就是\
print ("hello world!\nhello Python!")
print ('\\t\')原样输出引号内字符串可以使用在引号前加r
-------------------------------------------------------日期天数相减------------------------------------------------------------------
someDay = datetime.date(1999,2,10)
anotherDay = datetime.date(1999,2,15)
deltaDay = anotherDay - someDay
deltaDay.days
----------------------------------------------------------列表(list)简单操作---------------------------------------------------------列表(list)用来存储一连串元素的容器,列表用[]来表示,其中元素的类型可不相同。
print (students)
type(students)列表索引和切片
索引从0开始,含左不含右
print ('[-4]=', students[-4])
print ('[0:4]=', students[0:4])
print( '[4:]=', students[4:])
print ('[0:4:2]=', students[0:4:2])
print ('[-5:-1:]=', students[-5:-1:])
print ('[-2::-1]=', students[-2::-1])修改列表
print (students[3])
print (students[5])
print (students[5])插入元素
students.extend(['long', 'wan'])#可以添加多个到尾部
print (students)
students.insert(6, scores) # 添加到指定左边第一位置,不会去重
students删除元素
print (students)判断元素是否在列表中等
print ('han' not in students)
students.count('wan')#计算个数
students.index('wan')#向量在哪个位置range函数生成整数列表
print (range(-5, 5))
print (range(-10, 10, 2))
print (range(16, 10, -1))
---------------------------------------------元组(tuple)简单操作--------------------------------------------------------------------### 3.2.2 元组(tuple)
元组类似列表,元组里面的元素也是进行索引计算。列表里面的元素的值可以修改,而元组里面的元素的值不能修改,只能读取。元组的符号是()。
studentsTuple
studentsTuple=['aaa','dddd']
studentsTuple1=('aaa','dddd')
print(studentsTuple)
type(studentsTuple1)判断异常
studentsTuple[1] = 'fu'
except TypeError:
print ('TypeError')
scores[1]= 100
studentsTuple
'ming' in studentsTuple
studentsTuple[0:4]
studentsTuple.count('ming')
studentsTuple.index('jun')
len(studentsTuple)
-------------------------------------------集合(set)简单操作-------------------------------------------------------------------------### 3.2.3 集合(set),去重功能
Python中集合主要有两个功能,一个功能是进行集合操作,另一个功能是消除重复元素。 集合的格式是:set(),其中()内可以是列表、字典或字符串,因为字符串是以列表的形式存储的
studentsSet = set(students)
print (studentsSet)
studentsSet.add('xu')#集合增加一个字符
print (studentsSet)
studentsSet.remove('xu')#集合减掉一个字符
print (studentsSet)
a = set("abcnmaaaaggsng")
print ('a=', a)
b = set("cdfm")
print ('b=', b)交集
print( 'x=', x)并集
print ('y=', y)差集
print( 'z=', z)去除重复元素
print( z)
------------------------------------------------------------字典(dict)简单操作--------------------------------------### 3.2.4字典(dict)
Python中的字典dict也叫做关联数组,用大括号{}括起来,在其他语言中也称为map,使用键-值(key-value)存储,具有极快的查找速度,其中key不能重复。
print (k["home"])
print( k.keys())#键
print( k.values())#值
a={"success":True,"reason_code":"200","reason_desc":"获取成功",
"rules":[{"rule_id":"1062274","score":7,"conditions":[{"address_a_value":
"南通市","address_a":"mobile_address","address_b":"true_ip_address","address_b_value":"南京市","type":"match_address"}]}]}
print(a["success"])添加、修改字典里面的项目
k['name'] = 'guangzhou'
print (k)判断key是否存在
has_key方法在python2中是可以使用的,在python3中删除了。
print (k.has_key('name'))
改为:
print("Yes")删除key-value元素
print (k)
--------------------------------------------------------if语句-----------------------------------------------------------------------### 3.3.2 分支结构:Python中if语句是用来判断选择执行哪个语句块的
=============================================================================
if
执行语句块
elif
执行语句块
else: # 都不满足
执行语句块
=============================================================================
elif子句可以有多条,elif和else部分可省略
if salary > 10000:
print ("Wow!!!!!!!")
elif salary > 5000:
print ("That's OK.")
elif salary > 3000:
print ("5555555555")
else:
print ("..........")
-----------------------------------------------for 循环------------------------------------------------------------------------------- for 循环
=============================================================================
for (条件变量) in (集合):
执行语句块
=============================================================================
“集合”并不单指set,而是“形似”集合的列表、元组、字典、数组都可以进行循环
条件变量可以有多个
for i in heights:
print (i, heights[i])for key, value in heights.items():-Python3 不能使用dict.iteritems(),改为dict.items()
print(key, value)
for i in range(1, 101):
total += i
print (total)### 3.3.3 循环结构
while 循环
=============================================================================
while
循环执行语句块
else: # 不满足条件
执行语句块
=============================================================================
else部分可以省略
while a < 10:
if a <= 5:
print (a)
else:
print ("Hello")
a = a + 1
else:
print ("Done")*** 练习:使用循环和分支结构输出20以内的奇数
### 3.3.4 break、continue和pass
break:跳出循环
continue:跳出当前循环
pass:占位符,什么也不做
if i == 3:
break
print (i)
if i == 3:
continue
print (i)
for i in range(1, 5):
if i == 3:
pass
print (i)
---------------------------------------------------------排序相关--------------------------------------------------------------------元组排序
print(sorted(myList))#升序排序
print(sorted(myList,reverse=True))#降序排序### 5.1.5 数据框排序
#### 1. 排序
'score':[98,78,87,77,77,np.nan],
'group':[1,1,1,2,1,2],})按'score'这个字段降序排序
按'score'这个字段升序排序
### 5.1.6 数据框分组汇总
sample.head()
sample.groupby('class') [['math']].max()
sample.groupby('class')[['math']].count()
sample.groupby('class')[['math']].mean()
sample.groupby('class')[['math']].sum()
df
-------------------------------------------------------拆分、堆叠列---------------------------------------------### 5.1.7 拆分、堆叠列
'type':['Normal','Special_offer',
'Normal','Special_offer','Special_offer'],
'Monetary':[3608,420,1894,3503,4567]})
pd.pivot_table(table,index='cust_id',columns='type',values='Monetary')
fill_value=0,aggfunc='sum')
columns='type',
values='Monetary',
fill_value=0,
aggfunc=np.sum).reset_index()
table1
id_vars='cust_id',
value_vars=['Normal','Special_offer'],
value_name='Monetary',
var_name='TYPE')
--------------------------------------------------数据框横向连接/纵向连接-------------------------------------------### 5.1.3 横向连接
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})#### 1. 数据框内连接,类似inner join
#### 2. 左连接,类似left join
df1.merge(df2,how='left',on='id')#### 3. 右连接,类似right join
#### 3. 行索引连接
'col1':['a','b','c']},
index = [1,2,3])
df2 = pd.DataFrame({'id2':[1,2,3],
'col2':['aa','bb','cc']},
index = [1,3,2])df1.join(df2)
### 5.1.4 纵向合并
'col':['a','a','b','c','v','e','q']})
df2 = pd.DataFrame({'id':[1,2,3,3,5],
'col':['x','y','z','v','w']})
df3 = df1.rename(columns = {'col':'new_col'})#数据框某列改名5.1 数据整合
### 5.1.1 行列操作
#### 1. 单列
import numpy as np
sample = pd.DataFrame(np.random.randn(4, 5),
columns=['a','b','c','d','e'])
print(sample)
sample['a']#数据框选择单列
sample[['a','b']]#数据框随意选择两列的
sample[0:2]#数据框选择前两行
sample.iloc[1:2,0:2]#数据框iloc选择行与列的写法
sample.ix[:,'a']
sample[['a']]#### 2. 选择多行和多列
#### 3. 创建、删除列
sample数据框增加两列
new_col3 = sample['a'] + sample['b'])
sample_new### 5.1.2 条件查询
'Miki','Sully','Rose'],
'score':[98,78,87,77,65,67],
'group':[1,1,1,2,1,2],})
sample#### 1. 单条件
sample.score > 70
sample[sample.score>70]#### 2. 多条件
#### 3. 使用query
sample.query('score > 90')
#### 4. 其他数据框sample['score'].后面跟聚合函数sum,mean,count,between
sample[sample['name'].str.contains('[M]+')]#数据框字符过滤,有点像live### 5.1.8 赋值与条件赋值
#### 1. 赋值
'Miki','Sully','Rose'],
'score':[99,78,999,77,77,np.nan],
'group':[1,1,1,2,1,2],})
'name':{'Bob':np.nan}})#### 2. 条件赋值
if row['group'] == 1:
return ('class1')
elif row['group'] == 2:
return ('class2')
sample.apply(transform,axis=1)
sample.loc[sample.group==1,'class_n']='class1'
sample.loc[sample.group==2,'class_n']='class2'