# Common pandas usage patterns for big-data analysis

#coding=utf8
import cfg
from impala.dbapi import connect
import pandas as pd
import numpy as np


# Open a DB-API connection to the 'dw' database on the host configured in cfg.
conn = connect(host=cfg.testip, port=10000, database='dw', auth_mechanism='PLAIN')
sql = "select * from a"
sql1 = 'select * from b'
try:
    # `con` is the connection pandas runs the query on; each call loads the
    # whole result set into an in-memory DataFrame.
    s1 = pd.read_sql(sql=sql, con=conn)
    s2 = pd.read_sql(sql1, conn)
finally:
    # Nothing below touches the database again, so release the server-side
    # session here -- also when one of the queries raised.
    conn.close()

# Preview of the first rows of table `a` (head() returns 5 rows by default).
r2 = s1.head()
# Remove the leading table name from every column label, e.g.
# 'a.question_id' -> 'question_id'.
# The pattern is anchored (^) and the dot is escaped on purpose: a bare 'a.'
# is treated as a regex by older pandas ('a' followed by ANY character, so
# 'subject_base_id' would become 'subject_be_id') and as a plain substring by
# newer pandas; either way it could also hit text in the middle of a name.
s1.columns = s1.columns.str.replace(r'^a\.', '', regex=True)
s2.columns = s2.columns.str.replace(r'^b\.', '', regex=True)
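# Select two columns and preview the first rows
# (head() returns 5 rows by default; use head(3) for exactly three).
# a= s1[['question_id',\
#        'subject_base_id']].head()
#
# print(a)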

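# Aggregation: drop a column you do not want aggregated (drop needs columns=...,
# otherwise it looks for a row label); agg() can compute several statistics per group.
# count = s1.drop(columns='subject_base_id').groupby('question_id').agg([np.mean,np.sum])
# min = s1.min()
# print(count,min)
#
# m = s1.groupby('paper_question_id').mean()
# print(m)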

# Select specific columns
# a = s1[['paper_question_id','sub_grade']]

# Get the column names
# print(s1.columns,s2.columns)
# Join the two tables. merge() does an inner join by default; pass e.g.
# how='left' for another join type. `on` accepts a list, i.e. several join
# conditions at once. Afterwards keep only the key columns and slice the rows
# (here the first 50; a slice such as [1:10] works the same way).
key_cols = ['question_id', 'subject_base_id']
newt = s1.merge(s2, on=key_cols).loc[:, key_cols].iloc[:50]

# Fetch specific rows by label (.ix from older pandas was removed in 1.0).
# row = newt.loc[[1, 4]]

# Boolean filtering: combine conditions with & (and) or | (or).
re = newt.loc[newt['question_id'].eq(4802574) & newt['subject_base_id'].eq(1)]
# print(re)

# Sort by both key columns (ascending) and show the result.
sor = newt.sort_values(key_cols)
print(sor)



# You may also be interested in: (python)