from pyhive import hive
import pymysql
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, functions
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
data = [[1, "nm", {'job': 'ba', 'age': 30}, ['cash', 'bill']]]
schema = """
id integer,
name string,
info struct<job:string, age:integer>,
interest array<string>
"""
dfsp = spark.createDataFrame(data=data, schema=schema)
dfsp.createOrReplaceTempView('tt')
spark.sql('select count(*) from tt').show()
#register a Python function as a SQL UDF (placeholder implementation for xx)
def xx(s):
    return s.upper() if s else None
spark.udf.register('xx', xx)
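#usage sketch: call the registered UDF from SQL against the tt view
spark.sql('select xx(name) from tt').show()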
dfsp.columns
dfsp.collect()
dfsp.select('id', 'name', 'info').describe().show()
dfsp.select(dfsp.name).show()
dfsp.filter(dfsp.id == 1).show()
dfsp.withColumn('upper_name', functions.upper(dfsp.name)).show()
dfsp.groupby('name').agg({'id': 'sum'}).show()
dfsp.show()
#Spark writes csv as a directory of part files; struct/array columns cannot go to csv, so keep only the flat columns
dfsp.select('id', 'name').write.csv('11.csv', header=True)
spark.read.csv('11.csv', header=True).show()
#convert to a pandas DataFrame
pdf = dfsp.toPandas()
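#sketch: flat columns convert cleanly back to Spark; struct/array columns may not round-trip through pandas
spark.createDataFrame(pdf[['id', 'name']]).show()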
#MySQL: query results into a DataFrame, then run a write statement
conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456', database='test')
sql = 'select * from test_table'    #hypothetical example query
df = pd.read_sql(sql, conn)
sql_c = 'delete from test_table'    #hypothetical example write statement
conn.cursor().execute(sql_c)        #pymysql takes str directly in Python 3; no .encode() needed
conn.commit()                       #writes need an explicit commit
#Hive: same pattern through PyHive
conn = hive.Connection(host='169.168.12.2', port=10000, username='hive', database='cdm')
df = pd.read_sql(sql, conn)
conn.cursor().execute(sql_c)
#con must be passed as a keyword; note: for MySQL, to_sql needs a SQLAlchemy connectable, see the sketch below
df.to_sql(name='tabnm', con=conn, index=False, if_exists='replace')
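'''
#pandas only officially supports SQLAlchemy connectables for MySQL (raw DBAPI
#connections are supported for sqlite3 only); a sketch, assuming sqlalchemy is installed:
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/test')
df.to_sql(name='tabnm', con=engine, index=False, if_exists='replace')
'''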
df = pd.read_excel(r'C:\Users\Mavey\Desktop\2023\0000\12.xlsx',sheet_name='2')
df.to_excel('121.xlsx',sheet_name='21')
'''
#save multiple sheets into one workbook
writer = pd.ExcelWriter('121.xlsx')
df.to_excel(writer, sheet_name='21')
df.to_excel(writer, sheet_name='01')
writer.close()    #pandas 2.0 removed writer.save()
#or with a context manager, which closes the writer automatically
with pd.ExcelWriter('121.xlsx') as writer:
    df.to_excel(writer, sheet_name='21')
    df.to_excel(writer, sheet_name='01')
'''
df = pd.read_csv(r'C:\Users\2023\0000\12.csv')
'''
#header=0 means the first row holds the column names (header=[0,1] for a two-level header);
#names renames the columns, index_col picks the index column, usecols keeps a subset of columns
df = pd.read_csv(r'C:\Users\Mavey\Desktop\2023\0000\12.csv', sep=':', header=0, names=['序号','日期','金额'], index_col='序号', usecols=[0,2,3])
#number of rows and columns
print(df.shape)
#read in chunks of 2 rows each
df = pd.read_csv('12.csv', chunksize=2)
#next(df)
chunk = []
for cc in df:
    chunk.append(cc)
#concatenate the chunks back into one DataFrame
ddf = pd.concat(chunk, axis=0)
#chunked reading with an explicit iterator
df = pd.read_csv('12.csv', iterator=True)
chunk = []
while True:
    try:
        cc = df.get_chunk(2)
        chunk.append(cc)
    except StopIteration:
        break
ddf = pd.concat(chunk)
'''
df.to_csv('12.csv')
'''
#append rows to an existing csv without rewriting the header
df.to_csv('12.csv', mode='a', header=None)
'''
df1.head()
#ip-range lookup: df1 has an 'ip' column; df2 has long_ip_start/long_ip_end ranges
#convert the dotted ip string to a 32-bit integer
df1['long_ip'] = df1.apply(lambda row: int(row['ip'].split('.')[0])*256**3 + int(row['ip'].split('.')[1])*256**2 + int(row['ip'].split('.')[2])*256 + int(row['ip'].split('.')[3]), axis=1)
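'''
#equivalent sketch with a named helper instead of the long lambda
def ip_to_long(ip):
    a, b, c, d = (int(x) for x in ip.split('.'))
    return a * 256**3 + b * 256**2 + c * 256 + d
df1['long_ip'] = df1['ip'].map(ip_to_long)
'''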
df2 = df2.drop(columns=['start','end'])
df2.drop_duplicates(inplace=True)
#cross join via a constant key, then keep rows whose ip falls inside the range
df1['key'] = 1
df2['key'] = 1
df = pd.merge(df1, df2, on='key', how='outer')
df = df.drop(columns='key')
df = df[(df.long_ip >= df.long_ip_start) & (df.long_ip <= df.long_ip_end)]
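'''
#with pandas >= 1.2 the dummy key is unnecessary:
df = pd.merge(df1, df2, how='cross')
'''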
#take the date part (first 10 chars) of the time string
df['pt_dt'] = df.apply(lambda row: row['time'][:10], axis=1)
#aggregate per day/country/province
df.groupby(['pt_dt','country','province']).sum().rename(columns={'user':'num'})
#row_number() over (partition by country, province order by time): rank rows, keep the first 3
df['row_num'] = df.sort_values(['time'], ascending=True).groupby(['country', 'province']).cumcount() + 1
df = df[df.row_num <= 3]
#pivot the top-3 rows wide: one column per (value, row_num) pair
df = pd.pivot_table(df, values=['user','time'], index=['country', 'province'], aggfunc={'user':np.max,'time':np.max}, columns=['row_num'])
#caution: these flat names must match the actual column order of the pivot result
df.columns = ['user_1','user_2','user_3','time_1','time_2','time_3']
df = df.reset_index()
'''
#alternative: rename everything after reset_index
df = df.reset_index()
df.columns = ['country', 'province','user_1','user_2','user_3','time_1','time_2','time_3']
'''
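'''
#sketch: flatten the MultiIndex programmatically (run right after pivot_table,
#before reset_index) so the names always match the actual column order
df.columns = [f'{val}_{num}' for val, num in df.columns]
df = df.reset_index()
'''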
cols = df.columns.tolist()
#reorder the columns
new_cols = ['country','province','user_1','time_1','user_2','time_2','user_3','time_3']
df = df.reindex(columns=new_cols)