ETL Basic Operations

With so many tools in day-to-day data work (Oracle, MySQL, Python, PySpark, Scala, and so on), the syntax for basic data operations is easy to forget or mix up, so here is a summary of the ones I use most often.

Python

1. Date handling
import datetime, time
# string to date
>>> datetime.datetime.strptime('20180103', '%Y%m%d')
datetime.datetime(2018, 1, 3, 0, 0)
# date to string
>>> datetime.datetime.strptime('20180103', '%Y%m%d').strftime('%Y%m%d')
'20180103'
# number of days between two dates
>>> (datetime.datetime.now() - datetime.datetime.strptime('20180103', '%Y%m%d')).days
148
# date n days later (subtract the timedelta instead for n days earlier)
>>> datetime.datetime.strptime('20180103', '%Y%m%d') + datetime.timedelta(days=148)
datetime.datetime(2018, 5, 31, 0, 0)
# timestamp to string
>>> datetime.datetime.fromtimestamp(1549434104).strftime('%Y%m%d%H%M%S')
'20190206142144'
>>> time.strftime('%Y%m%d', time.localtime(1537632512))
'20180923'
# generic form, where ts is a Unix timestamp
datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
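In ETL jobs the previous day's partition string comes up constantly; a minimal sketch building on the imports above (the variable name is illustrative):
# yesterday as a 'yyyymmdd' partition key
yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d')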

2. Plotting with matplotlib

(Figure: overview of participating users by date)
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
from datetime import datetime
import matplotlib.dates as mdates

def datelist(beginDate, endDate):
    # beginDate and endDate may be strings like '20160601' or datetime objects
    date_l=[datetime.strftime(x,'%Y%m%d') for x in list(pd.date_range(start=beginDate, end=endDate))]
    return date_l

city_cnt = pd.read_csv("C:/Users/xxx/Desktop/work/20180809CGL/x.txt", header=0, sep=",", index_col=False)
fig = plt.figure(figsize=(16,4))
ax = fig.add_subplot(111)  # note: configure things on ax rather than through plt
xmajorFormatter = FormatStrFormatter('%d')  # format for the x-axis tick labels
ax.xaxis.set_major_formatter(xmajorFormatter)
ax.set_title("活动参与用户数-日期", fontsize=18)
color = ['red', 'green', 'blue', 'yellow', 'skyblue', 'pink', 'purple', 'grey']
dt_new = city_cnt.groupby('join_time').agg({'count':'sum'}).sort_index()
days = datelist(datetime(2018,7,10), datetime(2018,8,12))
index = np.arange(len(days))
bar_width = 0.55
opacity = 0.4
ax.bar(index, dt_new['count'], bar_width, alpha=opacity, color='green',label='New',align="center")
plt.xticks(index, days, fontsize = 12)
ax1 = ax.twinx()
ax1.set_ylim(1, 5000)
line_csm = ax1.plot(index, dt_new['count'].cumsum(), color='red', label= 'Total: ' + str(round(dt_new['count'].sum(),2)))
ax.plot(index, [dt_new['count'].mean()]*len(dt_new['count']), '-.', color='pink', label= 'Avg: ' + str(round(dt_new['count'].mean(),2)), linewidth=2,alpha=0.5)
ax.legend(loc='upper left')
ax1.legend(loc='upper right')
plt.gcf().autofmt_xdate()
plt.tight_layout()
plt.show()
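To write the chart to a file instead of only displaying it, one extra line is enough (file name and dpi are illustrative):
fig.savefig('join_users_by_date.png', dpi=150, bbox_inches='tight')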

PySpark

1. DataFrame operations

# read a JSON file and flatten it
from pyspark.sql.functions import explode
>>> z = spark.read.json('/tmp/x.json')
>>> z.select('_id','tags').show(11,False)
+--------------------------+--------------------------+
|_id                       |tags                      |
+--------------------------+--------------------------+
|[5b4d9d797ecd702d52d4925c]|[mongodb, database, NoSQL]|
|[5bc05a33a8180ea2ebdb18f3]|[mongodb, database, NoSQL]|
+--------------------------+--------------------------+
>>> z.select(z._id,explode(z.tags)).show(11,False)
+--------------------------+--------+
|_id                       |col     |
+--------------------------+--------+
|[5b4d9d797ecd702d52d4925c]|mongodb |
|[5b4d9d797ecd702d52d4925c]|database|
|[5b4d9d797ecd702d52d4925c]|NoSQL   |
|[5bc05a33a8180ea2ebdb18f3]|mongodb |
|[5bc05a33a8180ea2ebdb18f3]|database|
|[5bc05a33a8180ea2ebdb18f3]|NoSQL   |
+--------------------------+--------+
# load a Parquet file
>>> x = sqlContext.read.load("/warehouse/data/song/TW_SONG_FTUR_D/2018/01/31/data.parquet")
>>> type(x)
<class 'pyspark.sql.dataframe.DataFrame'>

# path wildcards (example below)
# match zero or more characters: *
# match exactly one character: ?
# match any one of the listed characters: [abc]
# match a character range: 0[1-9]
# match any one of the listed strings: {08,09,10}
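For example, reading January through September of 2018 in one pass, assuming the same year/month/day directory layout as above (a sketch):
x = sqlContext.read.load("/warehouse/data/song/TW_SONG_FTUR_D/2018/0[1-9]/*/data.parquet")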

# filter and select
orders = sqlContext.read.load("/warehouse/raw/clientlog/TO_MINIK_CLIENT_SONG_PLAY_OPERATE_REQ_D/2018/04/*/data.parquet").filter("play_time>30 and order_id like 'W%' and optrate_type in (1,3)").select("order_id","songid").cache()

# add a derived column (log / pow)
tw_song_ftur_d.withColumn("RSI_O_D", pow(0.85 * log(col('SING_CNT') + 1) + 0.15 * log(col('USR_CNT') + 1), 2))
tw_song_ftur_d.withColumn('dt', lit(v_date))  # a constant as a column
# a Chinese column name breaks the parser (wrap it in backticks)
hit_songs.filter("`精彩评论` is null").show()
# rename columns with aliases
>>> df.selectExpr("age * 2 as age2", "abs(age)").collect()
>>> play_cnt = song_play.groupBy("uid").agg(count('songid').alias('count'), countDistinct('songid').alias('dis_count'))

# change a column's type
x.selectExpr('DATA_DT','MID','cast(INV_RATE as double)')
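The same cast can also be written with withColumn and cast (col comes from pyspark.sql.functions); a sketch:
x.withColumn('INV_RATE', col('INV_RATE').cast('double'))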

# add a column computed by a user-defined function (UDF)
from pyspark.sql.functions import *
def city(prov, addr):
    # extract the city name from a Chinese address string, given its province
    city = ''
    if '自治区' in addr:
        city = addr[addr.find('自治区')+3:addr.find('自治区')+5] if addr.find('市')==-1 else addr[addr.find('自治区')+3:addr.find('市')]
    elif '澳門' in addr:
        city = '澳門'
    elif '省' in addr:
        city = addr[addr.find('省')+1:addr.find('省')+3] if addr.find('市')==-1 else addr[addr.find('省')+1:addr.find('市')]
    else:
        city = addr[addr.find(prov):addr.find(prov)+2]
    return city[0:2].strip() if len(city)>4 else city.strip()

func_prv = udf(prv)    # prv: a province-extracting function, defined along the same lines as city() (not shown here)
func_city = udf(city)
TO_MNK_MAC_LOC_D.withColumn('p', func_prv(col('ADDR'))).withColumn('c', func_city(col('p'), col('ADDR'))).registerTempTable("TO_MNK_MAC_LOC_D")

# declare the UDF's return type (the default string type breaks numeric sorting)
>>> @udf(returnType='int')
... def udf_len(s):
...     return len(s)
x.withColumn('z', udf_len('singer')).orderBy(col('z').desc()).select('z', 'singer', 'name').show(5, False)

# sort by a column
tw_song_ftur_d.orderBy(tw_song_ftur_d.RCT_30_SING_CNT.desc()).show()
from pyspark.sql.functions import *
tw_song_ftur_d.orderBy(col("RCT_30_SING_CNT").desc()).show()

# window functions
from pyspark.sql.window import Window
window_spec = Window.partitionBy(TO_HSKP_MAC_STORE_MAP_D.STORE_ID).orderBy(col('CREAT_TM').desc())
TO_HSKP_MAC_STORE_MAP_D.withColumn('rank', row_number().over(window_spec))
df = sqlContext.createDataFrame([
    ["Student A", 1, "Science", 10],
    ["Student B", 1, "Science", 20],
    ["Student C", 2, "Science", 30],
    ["Student CC", 2, "Science", 30],
    ["Student D", 2, "Science", 40],
    ["Student D", 3, "Science", 50],
    ["Student E", 4, "Art", 10],
    ["Student F", 4, "Art", 20],
    ["Student G", 5, "Art", 30],
    ["Student H", 5, "Art", 40],
    ["Student I", 6, "Art", 50],
    ], ["name", "class", "subject", "score"])
## ties share the same rank (rank(): gaps follow ties, so numbers still run up to the row count)
df.withColumn('rank', rank().over(Window.partitionBy('subject').orderBy(col('score').desc()))).show()
## ties get distinct numbers in arbitrary order (row_number(): 1..N with no repeats)
df.withColumn('rank', row_number().over(Window.partitionBy('subject').orderBy(col('score').desc()))).show()

## collapse multiple rows into one (and explode back into multiple rows)
grp_uid = Window.partitionBy('uid')
play_list = play.withColumn('lst', collect_list('songid').over(grp_uid)).groupBy('uid').agg(max('lst').alias('song_list'))
>>> play_list 
DataFrame[uid: bigint, song_list: array]
play_lists = play_list.withColumn('songid', explode(col('song_list')))

## JSON: explode one row into multiple rows
ejson = spark.read.load("/tmp/enterprise/json/*.json", format="json")
e_df = ejson.select(explode("erDataList"))
>>> ejson.printSchema()
root
 |-- erDataList: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- businessScope: string (nullable = true)
 |    |    |-- capital: string (nullable = true)
 |    |    |-- character: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- legalRepresentative: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- province: string (nullable = true)
 |    |    |-- registrationDay: string (nullable = true)

>>> e_df = ejson.select(explode("erDataList"))
>>> e_df.printSchema()
root
 |-- col: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- businessScope: string (nullable = true)
 |    |-- capital: string (nullable = true)
 |    |-- character: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- code: string (nullable = true)
 |    |-- legalRepresentative: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- province: string (nullable = true)
 |    |-- registrationDay: string (nullable = true)

### filtering on an array column
1. play_list.where(array_contains("song_list","lx143416"))
2. SQL: select .. from ... where array_contains(song_list, 'lx143416')


# aggregation
play_cnt = song_play.groupBy("uid").agg(countDistinct('songid'))
df.groupBy("group").agg({"money":"sum"}).withColumnRenamed("SUM(money)", "money").show()

# running total
play_sum = spark.sql('select dt,songid, sum(count) over(partition by songid order by dt asc) as sum_sing from SS')
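The same running total can be expressed with the DataFrame API; a sketch, where ss_df is assumed to be the DataFrame behind the SS temp view:
# Window from pyspark.sql.window, sum/col from pyspark.sql.functions (imported above)
w = Window.partitionBy('songid').orderBy(col('dt').asc())
play_sum = ss_df.withColumn('sum_sing', sum('count').over(w))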


# date/time functions
date_format(join_time,'yyyyMMdd')
from_unixtime(ACT_TM,'yyyy-MM-dd HH:mm:ss') 
to_unix_timestamp(substr(ACT_TM,0,14), 'yyyyMMddHHmmss')

# export to CSV
df.write.csv("/dir/")
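For hand-offs it usually helps to add a header row and collapse to a single output file; common options (a sketch):
df.coalesce(1).write.csv("/dir/", header=True, mode="overwrite")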

2. Excel to DataFrame

import xlrd
from pyspark.sql import Row
from pyspark.sql import SQLContext, HiveContext

data = xlrd.open_workbook("/apps/zengh/删除歌单2.xlsx")
tables = data.sheets()

lines = []

for table in tables:
    for line in range(1, table.nrows):  # start at 1 to skip the header row
        lines.append(table.row_values(line))
      
rdd = sc.parallelize(lines)
df = sqlContext.createDataFrame(rdd.map(lambda line: Row(songname=line[0],singer=line[1])))
df.registerTempTable("SONG_CPRT")
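Once registered, the sheet can be queried like any other table, for example:
sqlContext.sql("select singer, count(*) as cnt from SONG_CPRT group by singer").show()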

3. SparkSQL

# date functions; join_time is a timestamp column
date_format(join_time,'yyyyMMdd')  
datediff(to_date('2018-07-31'),to_date(concat_ws('-', substring(B.ACTV_DT,0,4),substring(B.ACTV_DT,5,2),substring(B.ACTV_DT,7,2))))
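Put together in a full query (the user_join table name is hypothetical; join_time follows the snippet above):
spark.sql("select date_format(join_time,'yyyyMMdd') as dt, count(*) as cnt from user_join group by date_format(join_time,'yyyyMMdd')").show()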

MySQL

  • Date handling
# convert between datetime and Unix timestamp
> select unix_timestamp(now());
| 1528098214                |
> select from_unixtime(1527512458);
| 2018-05-28 21:00:58       |
# convert between datetime and string
> select date_format(now(), '%Y%m%d');
| 20180604                     |
> select str_to_date('20180604','%Y%m%d');
| 2018-06-04                       |
# convert between Unix timestamp and string
> select from_unixtime(1527512458, '%Y%m%d');
| 20180528                            |
> select unix_timestamp('20160102');
> select unix_timestamp('2016-01-02');
| 1451664000                 |
> select unix_timestamp(str_to_date('20180102', '%Y%m%d'));
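Date arithmetic for the matching "n days before/after" case (a sketch):
> select date_format(date_sub(now(), interval 1 day), '%Y%m%d');
> select datediff(now(), str_to_date('20180103', '%Y%m%d'));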

