These are just some operations I commonly use when processing data; a personal memo, nothing more.
################################################## Reading data
1. csv
import csv
import numpy as np

csv_file_object = csv.reader(open('train.csv', 'r'))
# Load in the csv file
header = next(csv_file_object)
# Skip the first line as it is a header
data=[]
# Create a variable to hold the data
for row in csv_file_object:
# Run through each row in the csv file,
data.append(row[0:])
# adding each row to the data variable
data = np.array(data)
# Then convert from a list to an array.
2. dataframe
train_df = pd.read_csv('train.csv', header=0) # Load the train file into a dataframe
########## For large files, read in chunks
def read_csv_chunk_ukt(filename, chunk_size=100000):
    reader = pd.read_csv(filename, iterator=True)
    loop = True
    chunks = []
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)[
                ["user_id", "sku_id", "type", "time"]]  # select only the columns we need
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
    df = pd.concat(chunks, ignore_index=True)
    return df
################## Creating a dataframe
re = pd.DataFrame({'del_buy': [result], 'user_id': [group.user_id.values[0]]},
                  columns=['user_id', 'del_buy'])
############### Inspecting a dataframe's basic info
train_df.info()      # per-column dtypes and non-null counts; look for missing values
train_df.describe()  # count / mean / std / min / max ...
############################################### Type conversion
1.
train_df['Gender'] = train_df['Sex'].map({'female': 0, 'male': 1}).astype(int)
# convert string labels to integer codes; map() applies element-wise
2.
from sklearn.preprocessing import LabelEncoder
all_pd.species = LabelEncoder().fit_transform(all_pd.species)
# fit_transform can be split into two steps: fit() then transform()
3. Map nominal (string) attribute values to integers, with identical strings mapping to the same integer. Unlike dummy encoding, this mapping produces only a single column.
tf['Sex'] = pd.factorize(tf['Sex'])[0]
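A minimal sketch of the difference, assuming a toy frame tf (the data is illustrative):
import pandas as pd

tf = pd.DataFrame({'Sex': ['male', 'female', 'male']})

# factorize: one integer column, same string -> same code
codes, uniques = pd.factorize(tf['Sex'])
print(codes)    # [0 1 0]
print(uniques)  # Index(['male', 'female'], dtype='object')

# dummy encoding for comparison: one 0/1 column per distinct value
print(pd.get_dummies(tf['Sex']))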
##################################### Dropping
# drop by column name; axis defaults to 0 (rows), so pass axis=1 for columns
t = trainMat.drop(['Age', 'Name'], axis=1)
# drop by index
temp = t[t['action_before_buy'] == 0]
tt = temp.index
t_last = t.drop(tt)
##################################### Removing duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(['user_id'])  # deduplicate on specific columns only
######################### Selecting multiple columns at once
1.
t = trainMat[['Age', 'Name']]
2. Use the dedicated indexer attribute (ix is deprecated; prefer loc/iloc)
data.ix[rows, cols]  # for multiple rows/columns, wrap them in an extra []
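Since .ix was removed from newer pandas, a small sketch of the .loc / .iloc equivalents (the toy data is illustrative):
import pandas as pd

data = pd.DataFrame({'Age': [22, 35, 58], 'Name': ['a', 'b', 'c']})

data.loc[0, 'Age']                  # label-based: a single cell
data.loc[[0, 2], ['Age', 'Name']]   # multiple rows/columns -> wrap them in []
data.iloc[0:2, 0:1]                 # position-based slicing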
3. Iterating over a dataframe
temp = []
for index, row in ui_pair.iterrows():  # yields (index, Series) pairs
    usr_id = row["user_id"]
    sku_id = row["sku_id"]
    # find every record related to this user-item pair
    temp.append(df[(df["user_id"] == usr_id) & (df["sku_id"] == sku_id)])
temp = pd.concat(temp, ignore_index=True)
########################################## Renaming and resetting the index
1. Rename columns
t.rename(columns={'date_received':'dates'},inplace=True)
2. Reset the index
all_df = pd.concat([train_df, test_df])
all_df = all_df.reset_index()  # or reset_index(drop=True) to skip the drop below
all_df = all_df.drop('index', axis=1)
all_df = all_df.reindex(columns=train_df.columns)  # reorder columns to match train_df (reindex_axis is deprecated)
############################################# Merging
df_user_del_buy = pd.concat(df_user_del_buy, ignore_index=True)  # concatenate a list of sub-records into one dataframe
actions = pd.merge(actions, user, how='left', on='user_id')
########################################## Missing values
1.
# the median is used rather than the mean, also to limit the influence of extreme values
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
# all ages with no data -> filled with the median of all ages
2.
# masked array: represents incomplete data or data containing invalid values
m = np.ma.masked_array(df['Age'], np.isnan(df['Age']))
mean = np.mean(m).astype(int)
df['Age'] = df['Age'].map(lambda x: mean if np.isnan(x) else int(x))
3.
df.loc[df['Age'].isnull(), 'Age'] = df['Age'].median()
# the chained form df['Age'][df['Age'].isnull()] = ... can trigger warnings such as
# "RuntimeWarning: invalid value encountered in greater/less" from pandas internals;
# prefer the .loc form above
4.
df['Fare'] = df['Fare'].map(lambda x: 0 if np.isnan(x) else int(x)).astype(int)
######################################### Splitting the dataset
1. Random split into training and test sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in versions < 0.18
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
2.
from sklearn.model_selection import KFold
kf = KFold(n_splits=2, shuffle=False)
# needs at least 2 elements in total; set shuffle=True to randomize the split
for train_index, test_index in kf.split(X):  # sklearn 0.17 has no .split(); there you iterate KFold(n=4, n_folds=2) directly
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
3.
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
for train_index, test_index in sss.split(x_dtrain, y_dtrain):  # older versions: StratifiedShuffleSplit(y_dtrain, 10, ...)
    X_train, X_test = x_dtrain[train_index], x_dtrain[test_index]
    y_train, y_test = y_dtrain[train_index], y_dtrain[test_index]
############################################# Data augmentation
from keras.preprocessing.image import ImageDataGenerator

def _initial_generator():
    return ImageDataGenerator(
        rescale=1. / 255,  # careful: this differs from how X_test is preprocessed
        shear_range=0.1,
        zoom_range=0.1,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True)
def train_generator_try():
    gen_te = _initial_generator()
    train_generator = gen_te.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        shuffle=True,  # Important !!!
        classes=FishNames,
        class_mode='categorical')
    return train_generator
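A hedged usage sketch for the generator above, assuming a compiled model and Keras 2's fit_generator signature (the epoch count is illustrative):
train_generator = train_generator_try()
model.fit_generator(
    train_generator,
    steps_per_epoch=train_generator.samples // batch_size,  # batches per epoch
    epochs=10)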
############################################## Training
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='auto')
model.fit(x_train, y_train, batch_size=64, epochs=50,  # nb_epoch in Keras 1
          validation_split=0.2, verbose=1, shuffle=True, callbacks=[early_stopping])
# verbose: 1 shows a progress bar, 2 prints one line per epoch
# callbacks: list of callback functions
######################################## Grid search
We usually determine the best set of parameters through a process called grid search,
which is simply a brute-force search over every combination of the given parameter candidates.
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in versions < 0.18
param_grid = {'n_estimators': [300, 500], 'max_features': [10, 12, 14]}
model = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=1, cv=10, verbose=20)
# model.best_params_ gives the best parameter combination
# model.cv_results_ (grid_scores_ in older versions) gives per-combination scores
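A self-contained sketch of the full grid-search loop, using a RandomForestClassifier and toy data as stand-ins (all names here are illustrative):
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
clf = RandomForestClassifier(random_state=0)
param_grid = {'n_estimators': [50, 100], 'max_features': [4, 8]}

model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)
model.fit(X, y)            # brute-force search over every combination
print(model.best_params_)  # best combination found
print(model.best_score_)   # its mean cross-validated score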
######################################### Saving a model
1. When saving a model, if you hit "RuntimeError: maximum recursion depth exceeded in cmp":
import sys
sys.setrecursionlimit(10000)
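The save itself is not shown above; a minimal sketch with pickle, where the recursion-limit fix matters for deep object graphs such as old Keras/Theano models (model and the filename are illustrative):
import pickle
import sys

sys.setrecursionlimit(10000)  # avoid the recursion error on deep object graphs

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)      # save
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)     # load back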
######################################## model_extract
from keras import backend as K
import theano
convout1_f = theano.function([model.layers[0].input], model.layers[3].output)
# maps the network input to the output of layer 3 (an intermediate activation);
# if the model uses dropout etc., the learning phase must also be passed (train mode = 1)
layer_output1 = convout1_f(X_train)
############################################### Timing
import time
start_time = time.time()
# ... the work to be timed goes here ...
print('Read and process test data time: {} seconds'.format(round(time.time() - start_time, 2)))
############################################## Sort & plot
# Output feature importance coefficients, map them to their feature names, and sort the values
clf.fit(X_train, y)
coef = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
coef.head(25).plot(kind='bar')
plt.title('Feature Significance')
plt.tight_layout()
############################################## Writing output
def submission(pred):
    le = LabelEncoder().fit(train_pd['species'])
    sub = pd.DataFrame(pred, columns=list(le.classes_))
    sub.insert(0, 'id', test_pd.id)
    sub = sub.reset_index(drop=True)  # reset_index() returns a new frame; it must be assigned
    sub.to_csv('submit.csv', index=False)
############################## Time series
from datetime import datetime
result = datetime.strptime(last_buy_day, '%Y-%m-%d %H:%M:%S') - datetime.strptime(first_day, '%Y-%m-%d %H:%M:%S')
result = result.days
# shift a timestamp back by n days (pd.datetime is deprecated; use datetime directly)
time_s_datetime = datetime.strptime(time, '%Y-%m-%d %H:%M:%S') - pd.Timedelta(days=n)
time_s_string = datetime.strftime(time_s_datetime, '%Y-%m-%d %H:%M:%S')
################################## groupby operations
grouped = df_buy.groupby(['user_id'])
results = grouped.apply(best_sku)
t = df.groupby(['user_id', 'sku_id']).apply(find_del_buy)
# feature extraction: count the number of each action type
df_action_type_counts = df_action_p['type'].value_counts()
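best_sku and find_del_buy are user-defined; a self-contained sketch of the groupby-apply pattern with a toy stand-in function (column names mirror the ones above):
import pandas as pd

df_buy = pd.DataFrame({'user_id': [1, 1, 2],
                       'sku_id': [10, 11, 10],
                       'type': [4, 1, 4]})  # e.g. 4 = purchase

def best_sku(group):
    # toy stand-in: the sku this user interacted with most often
    return group['sku_id'].value_counts().idxmax()

results = df_buy.groupby(['user_id']).apply(best_sku)
print(results)  # Series indexed by user_id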