python3 连接hive 的环境安装 |
1、pip install six |
2、pip install bitarray ##注意: PyPI 上的包名是 bitarray,不是 bit_array |
3、pip install thriftpy ##注意: thrift (on Python 2.x) or thriftpy (on Python 3.x) |
4、pip install thrift_sasl |
5、pip install impyla |
6、pip install sasl |
查找sasl文件的路径 |
①.wheel address:https://www.lfd.uci.edu/~gohlke/pythonlibs/#sasl |
②.打开①中的连接后,CTRL+F 输入sasl 找到对应的版本 |
③.打开 Anaconda 的安装目录,修改 D:\Anaconda3\Lib\site-packages\thriftpy\parser 第488行的代码:将 if url_scheme == '': 改为 if len(url_scheme) <= 1: |
④.对②中下载好的 whl 文件直接执行 pip install <文件名> 即可 |
from impala.dbapi import connect #引入此包 与hive 建立连接
from impala.util import as_pandas # 把hive查询结果 直接转化为pandas
import pandas as pd
import pymysql
import xlwt #excel 操作包
import re
# Widen pandas' console output so wide query results print on one line.
pd.set_option("display.width",5000)
def hive():
    """Fetch one day (2019-03-31) of dianpu_new listings from Hive.

    Returns the full query result as a pandas DataFrame.  Host/port/date
    are hard-coded for this batch job.
    """
    cursor = connect(host='192.168.0.0', port=10000, database='default').cursor()
    print('hive 连接成功')
    query = "select * from dianpu_new where pub_date between '2019-03-31'and '2019-03-31'"
    cursor.execute(query)
    frame = as_pandas(cursor)
    cursor.close()
    return frame
def dataframe_manage(hive_data):
    """Normalize the raw Hive frame: strip the table-name prefix from the
    column names and drop rows with a missing address or any other null.

    Parameters
    ----------
    hive_data : pandas.DataFrame
        Raw Hive result; columns arrive as 'dianpu_new.<name>'.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with bare column names and no null rows.
    """
    df = hive_data
    # BUG FIX: the original printed `columns_name` while the line defining it
    # was commented out, raising NameError on every call.
    columns_name = df.columns.tolist()
    print("columns_name", columns_name)
    # Hive prefixes every column with the table name; strip it off.
    df.columns = [column.replace('dianpu_new.', '') for column in columns_name]
    print("df.columns", df.columns)
    # BUG FIX: `df['address'] == None` is always False element-wise, so the
    # original drop was a no-op; isna() actually finds missing addresses.
    df = df.drop(df[df['address'].isna()].index.tolist())
    # BUG FIX: dropna() returns a new frame; the original discarded it.
    df = df.dropna(how='any', axis=0)
    return df
if __name__ == '__main__':
    # First-draft entry point (superseded by the fuller pipeline later in
    # this file): pull one day of listings from Hive and run basic cleanup.
    hive_data=hive()
    df = dataframe_manage(hive_data)
from impala.dbapi import connect
from impala.util import as_pandas
import pandas as pd
import pymysql
import xlwt
import re
import time
from time import ctime, sleep
import collections
# Widen pandas' console output so wide query results print on one line.
pd.set_option("display.width",5000)
# Wall-clock start of the batch run; elapsed time is reported in __main__.
startTime = time.time()
def hive():
    """Fetch one day (2019-03-31) of dianpu_new listings from Hive.

    Returns
    -------
    pandas.DataFrame
        All rows whose pub_date falls on 2019-03-31.
    """
    cur = connect(host='192.168.0.0', port=10000, database='default').cursor()
    print('hive 连接成功')
    try:
        cur.execute("select * from dianpu_new where pub_date between '2019-03-31'and '2019-03-31'")
        hive_data = as_pandas(cur)
    finally:
        # FIX: close the cursor even when the query raises (the original
        # leaked it on error); also dropped the unused `m, n = ...shape`.
        cur.close()
    return hive_data
def mysql():
    """Fetch one day (2019-03-31) of house_info rows from MySQL.

    Returns
    -------
    pandas.DataFrame
        All rows whose date falls on 2019-03-31.
    """
    conn = pymysql.connect("192.168.0.0", "data_user", "first2018qaz", "FBDdata", charset='utf8')
    sql = "SELECT * FROM house_info where date between '2019-03-31'and '2019-03-31'"
    try:
        # pandas reads straight from the DB-API connection into a DataFrame.
        sql_data = pd.read_sql(sql, conn)
    finally:
        # BUG FIX: the original never closed the connection (resource leak).
        conn.close()
    return sql_data
def dataframe_manage(df):
    """Normalize the raw Hive frame: strip the 'dianpu_new.' prefix from
    every column name and drop rows with missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Hive result; columns arrive as 'dianpu_new.<name>'.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with bare column names and no null rows.
    """
    # Hive prefixes each column with the table name; strip it off.
    df.columns = [column.replace('dianpu_new.', '') for column in df.columns.tolist()]
    # BUG FIX: `df['address'] == None` is always False element-wise, so the
    # original drop was a no-op; isna() actually finds missing addresses.
    # (Also removed the unused `columns_name` local.)
    df = df.drop(df[df['address'].isna()].index.tolist())
    # BUG FIX: dropna() returns a new frame; the original discarded it.
    df = df.dropna(how='any', axis=0)
    return df
def address_menarge(df):
    """Split 'district-neighbourhood' addresses into address1/address2 columns.

    Missing pieces are replaced by the placeholder '1'; rows whose district
    ends up as the placeholder or as the catch-all '其他区' are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'address' column formatted as 'part1-part2'.

    Returns
    -------
    pandas.DataFrame
        Same frame with 'address1'/'address2' added and junk districts removed.
    """
    address1 = []
    address2 = []
    for raw in df['address'].tolist():
        parts = str(raw).split('-')
        # BUG FIX: an address without '-' yields a one-element list and the
        # original indexed parts[1] unconditionally, raising IndexError.
        first = parts[0]
        second = parts[1] if len(parts) > 1 else ''
        if first == 'None':
            # The literal string 'None' means the address was missing entirely.
            address1.append('1')
            address2.append('1')
        else:
            # Empty pieces get the '1' placeholder, same as the original logic.
            address1.append(first if first != '' else '1')
            address2.append(second if second != '' else '1')
    df['address1'], df['address2'] = address1, address2
    # Drop rows with no usable district ('1') or the catch-all '其他区'.
    df = df.drop(df[df['address1'].isin(['1', '其他区'])].index.tolist())
    return df
def Web_Analysis(df, df_ffx):
    """Summarize listing counts and agent head-counts per publishing platform.

    from_web codes: 1 = 58同城, 2 = 安居客; 菲斯特 rows are matched by
    agent_company; df_ffx holds the 房发现 platform's listings.

    Parameters
    ----------
    df : pandas.DataFrame
        Hive listings with 'from_web', 'agent_company' and 'agent_name'.
    df_ffx : pandas.DataFrame
        房发现 listings with an 'agent_name' column.

    Returns
    -------
    pandas.DataFrame
        One row per platform plus a totals row: listing count, distinct
        agents, and (rounded) listings per agent.
    """
    df_wb = df.loc[df['from_web'] == 1]
    df_aj = df.loc[df['from_web'] == 2]
    df_first = df[df['agent_company'].isin(['菲斯特房地产', '成都菲斯特房地产营销策划有限公司'])]
    # Listing counts per platform (shape[0] == number of rows).
    counts_list = [df_wb.shape[0], df_aj.shape[0], df_first.shape[0], df_ffx.shape[0]]
    # Distinct agents per platform (value_counts().shape[0] == unique names).
    agent_list = [df_wb['agent_name'].value_counts().shape[0],
                  df_aj['agent_name'].value_counts().shape[0],
                  df_first['agent_name'].value_counts().shape[0],
                  df_ffx['agent_name'].value_counts().shape[0]]
    # FIX: the original built an `avg` list it never used and then recomputed
    # every ratio inline in the dict literal; compute each average once here.
    avg = [round(c / a) for c, a in zip(counts_list, agent_list)]
    avg_sum = round(sum(counts_list) / sum(agent_list))
    data = collections.OrderedDict([
        ('网站来源', ['58同城', '安居客', '菲斯特', '房发现平台', '合计']),
        ('发布条数', counts_list + [sum(counts_list)]),
        ('经纪人数', agent_list + [sum(agent_list)]),
        ('人均发布数', avg + [avg_sum]),
    ])
    return pd.DataFrame(data)
def Region_Analysis(df):
    """Rank districts (address1) by how many listings each one published.

    Returns
    -------
    pandas.DataFrame
        rank / district name / listing count / share of total (two-decimal
        percentage), ordered from busiest district down.
    """
    # value_counts() gives counts sorted descending; index carries the names.
    counts = df['address1'].value_counts()
    names = counts.index.tolist()
    totals = counts.tolist()
    overall = sum(totals)
    share = ['{:.2%}'.format(c / overall) for c in totals]
    data = collections.OrderedDict([
        ('排名', list(range(1, len(names) + 1))),
        ('区域名称', names),
        ('发布条数', totals),
        ('占比', share),
    ])
    return pd.DataFrame(data)
def Totalp_Analysis(df):
    """Bucket listings by total price (万) and rank the buckets by volume.

    pd.cut splits 'total_price' into 8 fixed intervals (the last one is
    open-ended via float('inf')); value_counts then orders the buckets by
    listing count, descending.

    Returns
    -------
    pandas.DataFrame
        rank / price-range label / count / share-of-total per bucket.
    """
    rank = list(range(1, 9))
    # Interval edges; inf keeps the top bucket open-ended (>1000).
    bin = [0, 50, 100, 200, 300, 400, 500, 1000, float("inf")]
    # FIX: compute the cut once — the original evaluated pd.cut three times
    # and bound the first result to an unused variable (df_ttp).
    buckets = pd.cut(df['total_price'], bin).value_counts()
    price_counts = buckets.tolist()
    price_cut = buckets.index.tolist()
    # Turn interval reprs like '(100.0, 200.0]' into '总价: 100-200'.  NOTE:
    # the '0-50'→'<=50' rewrite also corrupts '400-500' (it contains '0-50');
    # the '40<=500'→'400-500' rule deliberately undoes that.
    price_names = ['总价: ' + str(i).replace(', ', '-').strip('(').strip(']').replace('.0', '')
                   .replace('0-50', '<=50').replace('40<=500', '400-500').replace('1000-inf', '>1000')
                   for i in price_cut]
    proportion = ['{:.2%}'.format(i / sum(price_counts)) for i in price_counts]
    data = collections.OrderedDict([
        ('排名', rank),
        ('总价区间 单位:万', price_names),
        ('发布条数', price_counts),
        ('占比', proportion)
    ])
    return pd.DataFrame(data)
def Price_Analysis(df):
    """Bucket listings by unit price (元/m²) and rank the buckets by volume.

    Returns
    -------
    pandas.DataFrame
        rank / price-range label / count / share-of-total per bucket,
        ordered by count descending.
    """
    rank = list(range(1, 9))
    bin = [0, 15000, 20000, 25000, 30000, 35000, 40000, 50000, float("inf")]
    buckets = pd.cut(df['price'], bin).value_counts()
    price_counts = buckets.tolist()
    price_cut = buckets.index.tolist()
    # Map each interval repr straight to its display label.
    # BUG FIX: the original chained replace('(50000.0, inf', '>5万') without
    # the closing bracket, leaving a stray ']' in the top label ('单价: >5万]').
    labels = {
        '(0.0, 15000.0]': '<1.5万',
        '(15000.0, 20000.0]': '1.5-2万',
        '(20000.0, 25000.0]': '2-2.5万',
        '(25000.0, 30000.0]': '2.5-3万',
        '(30000.0, 35000.0]': '3-3.5万',
        '(35000.0, 40000.0]': '3.5-4万',
        '(40000.0, 50000.0]': '4-5万',
        '(50000.0, inf]': '>5万',
    }
    price_names = ['单价: ' + labels.get(str(i), str(i)) for i in price_cut]
    proportion = ['{:.2%}'.format(i / sum(price_counts)) for i in price_counts]
    data = collections.OrderedDict([
        ('排名', rank),
        ('单价区间', price_names),
        ('发布条数', price_counts),
        ('占比', proportion)
    ])
    return pd.DataFrame(data)
def Acreage_Analysis(df):
    """Bucket listings by floor area (m²) and rank the buckets by volume.

    Returns
    -------
    pandas.DataFrame
        rank / area-range label / count / share-of-total per bucket,
        ordered by count descending.
    """
    rank = list(range(1, 6))
    bin = [0, 50, 100, 150, 200, float("inf")]
    buckets = pd.cut(df['acreage'], bin).value_counts()
    acreage_counts = buckets.tolist()
    acreage_cut = buckets.index.tolist()
    # Map each interval repr straight to its display label.
    # BUG FIX: the original rewrote the bottom bucket to the bare string '50'
    # (replace('0-50','50')), which is ambiguous; label it '<50' to match the
    # '面积1:<50' wording used by the Top-5 region table.
    labels = {
        '(0.0, 50.0]': '<50',
        '(50.0, 100.0]': '50-100',
        '(100.0, 150.0]': '100-150',
        '(150.0, 200.0]': '150-200',
        '(200.0, inf]': '>200',
    }
    acreage_names = ['面积: ' + labels.get(str(i), str(i)) for i in acreage_cut]
    proportion = ['{:.2%}'.format(i / sum(acreage_counts)) for i in acreage_counts]
    data = collections.OrderedDict([
        ('排名', rank),
        ('面积区间', acreage_names),
        ('发布条数', acreage_counts),
        ('占比', proportion)
    ])
    return pd.DataFrame(data)
def Top5totalp_Analysis(df):
    """Cross-tab: total-price distribution for the five busiest districts.

    Returns
    -------
    pandas.DataFrame
        One row per price bucket; one column per top district.  ROBUSTNESS
        FIX: the original hard-coded indices [0]..[4] and raised IndexError
        when fewer than five districts were present; columns are now built
        dynamically from whatever districts exist.
    """
    bin = [0, 50, 100, 200, 300, 400, 500, 1000, float("inf")]
    # Districts ranked by listing volume; keep at most the top five.
    top_regions = df['address1'].value_counts().index.tolist()[:5]
    data = collections.OrderedDict()
    data['总价区间 单位:万'] = ['总价1:<=50', '总价2:50-100', '总价3:100-200', '总价4:200-300', '总价5:300-400',
                          '总价6:400-500', '总价7:500-1000', '总价8:>1000']
    for region in top_regions:
        # Count this district's listings per bucket, then sort into bin order
        # so rows line up with the labels above.
        per_bucket = pd.cut(df[df['address1'] == region]['total_price'], bin).value_counts()
        ordered = pd.DataFrame(data={'bin': per_bucket.index, 'count': per_bucket.values})
        ordered.sort_values('bin', inplace=True)
        data[region] = ordered['count'].tolist()
    return pd.DataFrame(data)
def Top5price_Analysis(df):
    """Cross-tab: unit-price distribution for the five busiest districts.

    Returns
    -------
    pandas.DataFrame
        One row per price bucket; one column per top district.  ROBUSTNESS
        FIX: columns are built dynamically instead of hard-coded [0]..[4]
        indexing, so fewer than five districts no longer raises IndexError.
    """
    bin = [0, 15000, 20000, 25000, 30000, 35000, 40000, 50000, float("inf")]
    top_regions = df['address1'].value_counts().index.tolist()[:5]
    data = collections.OrderedDict()
    data['单价区间'] = ['单价1:<1.5万', '单价2:1.5-2万', '单价3:2-2.5万', '单价4:2.5-3万', '单价5:3-3.5万',
                    '单价6:3.5-4万', '单价7:4-5万', '单价8:>5万']
    for region in top_regions:
        # Count this district's listings per bucket, sorted into bin order.
        per_bucket = pd.cut(df[df['address1'] == region]['price'], bin).value_counts()
        ordered = pd.DataFrame(data={'bin': per_bucket.index, 'count': per_bucket.values})
        ordered.sort_values('bin', inplace=True)
        data[region] = ordered['count'].tolist()
    return pd.DataFrame(data)
def Top5acrege_Analysis(df):
    """Cross-tab: floor-area distribution for the five busiest districts.

    Returns
    -------
    pandas.DataFrame
        One row per area bucket; one column per top district.  ROBUSTNESS
        FIX: columns are built dynamically instead of hard-coded [0]..[4]
        indexing, so fewer than five districts no longer raises IndexError.
    """
    bin = [0, 50, 100, 150, 200, float("inf")]
    top_regions = df['address1'].value_counts().index.tolist()[:5]
    data = collections.OrderedDict()
    data['面积区间 单位:m²'] = ['面积1:<50', '面积2:50-100', '面积3:100-150', '面积4:150-200', '面积5:>200']
    for region in top_regions:
        # Count this district's listings per bucket, sorted into bin order.
        per_bucket = pd.cut(df[df['address1'] == region]['acreage'], bin).value_counts()
        ordered = pd.DataFrame(data={'bin': per_bucket.index, 'count': per_bucket.values})
        ordered.sort_values('bin', inplace=True)
        data[region] = ordered['count'].tolist()
    return pd.DataFrame(data)
if __name__ == '__main__':
    # Pull the day's data from both stores, clean it, run every report,
    # then write each report to its own sheet of one Excel workbook.
    hive_data = hive()
    mysql_data = mysql()
    hive_df = address_menarge(dataframe_manage(hive_data))
    # (report DataFrame, target sheet name) — all computed before writing.
    reports = [
        (Web_Analysis(hive_df, mysql_data), '网络来源'),
        (Region_Analysis(hive_df), '区域排名'),
        (Totalp_Analysis(hive_df), '总价排名'),
        (Price_Analysis(hive_df), '单价排名'),
        (Acreage_Analysis(hive_df), '面积排名'),
        (Top5totalp_Analysis(hive_df), '总价区域'),
        (Top5price_Analysis(hive_df), '单价区域'),
        (Top5acrege_Analysis(hive_df), '面积区域'),
    ]
    with pd.ExcelWriter(r'C:\\Users\\yuanye\\Desktop\\fang_yuan\\fang_tan.xlsx') as writer:
        for report, sheet in reports:
            report.to_excel(writer, sheet_name=sheet, index=False)
    print("完成第一批统计")
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))