from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('mu').master('local').getOrCreate()
df = spark.read.parquet(parquet_file)
df = spark.read.csv(csv_file)
df = spark.read.json(json_file)
df = spark.createDataFrame(RDD, schema)
df = rdd.toDF(*cols)
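These readers accept format options. A minimal sketch for a CSV file with a header row (header and inferSchema are standard DataFrameReader options; csv_file is assumed to be a path as above):
df = spark.read.csv(csv_file, header=True, inferSchema=True)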
# Convert to and from a pandas DataFrame:
df = spark.createDataFrame(pandas_df)
pandas_df = df.toPandas()
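A minimal round-trip sketch, assuming pandas is installed and Spark 3.x (the Arrow setting is optional but speeds up the conversion):
import pandas as pd
pdf = pd.DataFrame({'Id': [1, 2], 'Name': ['Joe', 'Henry']})
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')  # optional acceleration; Spark 3.x config key
sdf = spark.createDataFrame(pdf)   # pandas -> Spark
pdf2 = sdf.toPandas()              # Spark -> pandas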
Inspecting the data:
df = spark.createDataFrame([('1', 'Joe', '70000', '1'), ('2', 'Henry', '80000', None)],
['Id', 'Name', 'Sallary', 'DepartmentId'])
# 1. View the schema
df.printSchema()
# Output
root
|-- Id: string (nullable = true)
|-- Name: string (nullable = true)
|-- Sallary: string (nullable = true)
|-- DepartmentId: string (nullable = true)
# 2. Basic summary statistics
df.describe().show()
# Output
+-------+------------------+-----+-----------------+------------+
|summary| Id| Name| Sallary|DepartmentId|
+-------+------------------+-----+-----------------+------------+
| count| 2| 2| 2| 1|
| mean| 1.5| null| 75000.0| 1.0|
| stddev|0.7071067811865476| null|7071.067811865475| NaN|
| min| 1|Henry| 70000| 1|
| max| 2| Joe| 80000| 1|
+-------+------------------+-----+-----------------+------------+
# 3. List the columns
df.columns
# Output
['Id', 'Name', 'Sallary', 'DepartmentId']
# 4. Count the rows
df.count()
# Output
2
# 5. Count non-null records per column
from pyspark.sql.functions import count
df.agg(*[count(c).alias(c) for c in df.columns]).show()
# Output
+---+----+-------+------------+
| Id|Name|Sallary|DepartmentId|
+---+----+-------+------------+
| 2| 2| 2| 1|
+---+----+-------+------------+
# Column selection is done with select
df.select('Id').show()
# Output
+---+
| Id|
+---+
| 1|
| 2|
+---+
# Select multiple columns
df.select('Id','Name').show()
# Output
+---+-----+
| Id| Name|
+---+-----+
| 1| Joe|
| 2|Henry|
+---+-----+
# Select columns by position
df.select(df.columns[0:2]).show()
# Output
+---+-----+
| Id| Name|
+---+-----+
|  1|  Joe|
|  2|Henry|
+---+-----+
# Row filtering, either with filter or with [] plus a condition; the latter works like pandas
df.filter((df['Id']==1)&(df['Name']=='Joe')).show()
df[(df['Id']==1)&(df.Name=='Joe')].show()
# Output
+---+----+-------+------------+
| Id|Name|Sallary|DepartmentId|
+---+----+-------+------------+
| 1| Joe| 70000| 1|
+---+----+-------+------------+
# Advanced: stratified sampling with df.sampleBy(col, fractions, seed=None), which samples according to the categories in a column
df = spark.createDataFrame([('a',1),('a',2),('a',3),('a',1),('b',1),('b',2)],['key', 'val'])
df.sampleBy('key', fractions={'a':0.2,'b':0.6}, seed=123).show()
+---+---+
|key|val|
+---+---+
| a| 1|
| b| 1|
| b| 2|
+---+---+
# Simple random sampling
df.sample(fraction, seed)
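For example, on the key/val DataFrame above (a sketch; the fraction and seed values are arbitrary):
df.sample(fraction=0.5, seed=3).show()   # roughly half of the rows, sampled without replacement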
# 1. Drop a column
df.drop('DepartmentId').show()
# 2. Modify a column with withColumn
## Sallary is stored as a string, so first cast it to int; the current schema:
root
|-- Id: string (nullable = true)
|-- Name: string (nullable = true)
|-- Sallary: string (nullable = true)
|-- DepartmentId: string (nullable = true)
df = df.withColumn('Sallary',df['Sallary'].cast('int'))
df.printSchema()
## Output: the Sallary column is now an integer
root
|-- Id: string (nullable = true)
|-- Name: string (nullable = true)
|-- Sallary: integer (nullable = true)
|-- DepartmentId: string (nullable = true)
## Flag whether Sallary is greater than 1000: 1 if yes, 0 otherwise
### Method 1: build a Boolean column, then cast it
df.withColumn('Sallary',(df['Sallary']>1000).cast('int')).show()
### Method 2: the when function, which is more flexible
from pyspark.sql.functions import when
df = df.withColumn('Sallary',when(df['Sallary']>1000,1).otherwise(0))
df.show()
## Output
+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
| 1| Joe| 1| 1|
| 2|Henry| 1| null|
+---+-----+-------+------------+
## Replace the digits in the str column with --
from pyspark.sql.functions import regexp_replace
df = spark.createDataFrame([('100-200',)], ['str'])
df.show()
df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).show()
+-------+
| str|
+-------+
|100-200|
+-------+
+-----+
| d|
+-----+
|-----|
+-----+
# 3. Adding a column also uses withColumn
## Add a column whose value is 0 for every row
from pyspark.sql.functions import lit
df.withColumn('newCol', lit(0)).show()
## Output
+---+-----+-------+------------+------+
| Id| Name|Sallary|DepartmentId|newCol|
+---+-----+-------+------------+------+
| 1| Joe| 70000| 1| 0|
| 2|Henry| 80000| null| 0|
+---+-----+-------+------------+------+
## The new column can also be computed from two existing columns
from pyspark.sql.functions import lit
df = df.withColumn('newCol', lit(1))
df.withColumn('new',df['Sallary']+df['newCol']).show()
## Output
+---+-----+-------+------------+------+-----+
| Id| Name|Sallary|DepartmentId|newCol| new|
+---+-----+-------+------------+------+-----+
| 1| Joe| 70000| 1| 1|70001|
| 2|Henry| 80000| null| 1|80001|
+---+-----+-------+------------+------+-----+
## Add a column new: when Id is 1, join the two string columns with '_'; this uses a custom UDF
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def cho(x, y):
    return str(x) + '_' + str(y)
cho1 = udf(cho, StringType())
df.withColumn('new', F.when(df['Id']==1, cho1(df.Id, df.Name)).otherwise(df['Id'])).show()
## Output
+---+-----+-------+------------+-----+
| Id| Name|Sallary|DepartmentId| new|
+---+-----+-------+------------+-----+
| 1| Joe| 70000| 1|1_Joe|
| 2|Henry| 80000| null| 2|
+---+-----+-------+------------+-----+
## Some handy methods are available directly on Column objects (df1 here is the FirstName/LastName frame created further below)
df1.withColumn('Initial', df1.LastName.substr(1,1)).show()
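A couple more of these built-in Column methods, sketched on the same df1 (startswith and contains are standard Column methods):
df1.select('LastName',
           df1.LastName.startswith('H').alias('starts_with_H'),
           df1.LastName.contains('ar').alias('contains_ar')).show()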
# 4. Rename columns
data = spark.createDataFrame([("Alberto", 2), ("Dakota", 2)],
["Name", "askdaosdka"])
data.show()
data.printSchema()
# Output
+-------+----------+
| Name|askdaosdka|
+-------+----------+
|Alberto| 2|
| Dakota| 2|
+-------+----------+
root
|-- Name: string (nullable = true)
|-- askdaosdka: long (nullable = true)
## Method 1: selectExpr
df = data.selectExpr("Name as name", "askdaosdka as age")
df.show()
df.printSchema()
## Output
+-------+---+
| name|age|
+-------+---+
|Alberto| 2|
| Dakota| 2|
+-------+---+
root
|-- name: string (nullable = true)
|-- age: long (nullable = true)
**Note: this is just a simple rename; selectExpr is far more powerful and supports essentially any SQL expression.**
# Example (from a different dataset with count and DEST_COUNTRY_NAME columns)
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)
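A sketch of the same idea on the data frame defined above (upper() and the arithmetic are plain Spark SQL expressions):
data.selectExpr("Name", "upper(Name) as NAME", "askdaosdka * 2 as doubled").show()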
## Method 2: alias
from pyspark.sql.functions import col
data = data.select(col("Name").alias("name"), col("askdaosdka").alias("age"))
data.show()
## Output
+-------+---+
| name|age|
+-------+---+
|Alberto| 2|
| Dakota| 2|
+-------+---+
## Method 3: when there are many columns, combine withColumnRenamed with reduce
from functools import reduce
oldColumns = data.schema.names
newColumns = ["name", "age"]
df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), data)
df.printSchema()
df.show()
## Output
root
|-- name: string (nullable = true)
|-- age: long (nullable = true)
+-------+---+
| name|age|
+-------+---+
|Alberto| 2|
| Dakota| 2|
+-------+---+
## notice that this method allows you to "overwrite" the same column
# Sort rows with sort
df.sort('Sallary', ascending=False).show()
# Output
+---+-----+-------+------------+------+
| Id| Name|Sallary|DepartmentId|newCol|
+---+-----+-------+------------+------+
| 2|Henry| 80000| null| 1|
| 1| Joe| 70000| 1| 1|
+---+-----+-------+------------+------+
# Sort by multiple columns (color_df below is a separate example DataFrame with length and color columns)
color_df.filter(color_df['length']>=4)\
.sort('length', 'color', ascending=False).show()
# Mixed ascending/descending sort
color_df.sort(color_df.length.desc(), color_df.color.asc()).show()
# orderBy also sorts; take() returns a list of Row objects
color_df.orderBy('length','color').take(4)
authors = [['Thomas','Hardy','June 2,1840'],
['Thomas','Hardy','June 2,1840'],
['Thomas','H',None],
['Jane','Austen','16 December 1775'],
['Emily',None,None]]
df1 = spark.createDataFrame(authors,schema=["FirstName","LastName","Dob"])
df1.show()
# Output
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2,1840|
| Thomas| Hardy| June 2,1840|
| Thomas| H| null|
| Jane| Austen|16 December 1775|
| Emily| null| null|
+---------+--------+----------------+
# Drop duplicate rows
df1.dropDuplicates().show()
# Output
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Jane| Austen|16 December 1775|
| Emily| null| null|
| Thomas| Hardy| June 2,1840|
| Thomas| H| null|
+---------+--------+----------------+
# Deduplicate on a subset of columns: a row is dropped as soon as its FirstName already appeared
df1.dropDuplicates(subset=['FirstName']).show()
# Output
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Emily| null| null|
| Jane| Austen|16 December 1775|
| Thomas| Hardy| June 2,1840|
+---------+--------+----------------+
There are two kinds of missing values to check for: NaN in numeric columns and plain None.
# Similar to pandas.isnull
from pyspark.sql.functions import isnull, isnan
# 1. Checking for None
df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).show()
# Output
+-----+-----+
| r1| r2|
+-----+-----+
|false|false|
| true| true|
+-----+-----+
# 2. Checking for NaN
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).show()
# Output
+-----+-----+
| r1| r2|
+-----+-----+
|false|false|
| true| true|
+-----+-----+
Use df.na.drop() or df.dropna() to discard rows with missing values, and df.na.fill() or df.fillna() to replace missing values.
# 1. Drop rows with missing values
df.na.drop()
df.dropna()
# thresh=2 keeps only rows that have at least 2 non-null values
df.na.drop(thresh=2).show()
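A small sketch of the thresh behaviour on a throwaway frame (rows with fewer than 2 non-null values are dropped):
df_thresh = spark.createDataFrame([(1, None, None), (1, 2, None), (1, 2, 3)], ('a', 'b', 'c'))
df_thresh.na.drop(thresh=2).show()   # keeps (1, 2, None) and (1, 2, 3) only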
# 2. Fill missing values
## Replace missing values with the mean
from pyspark.sql import functions as F  # import Spark built-in functions
# collect() returns the data to the driver as Row objects; [0][0] extracts the mean value
mean_salary = df.select(F.mean('salary')).collect()[0][0]
clean_data = df.na.fill({'salary':mean_salary})
## The mean could also be computed with numpy
import numpy as np
mean_salary = np.mean(df.select('salary').collect())
## Fill every column's missing values with the same value
df.fillna('unknown').show()
## Fill different columns with different values
df.na.fill({'LastName':'--', 'Dob':'unknown'}).show()
Joining DataFrames to combine columns:
df1 = spark.createDataFrame([('a',1),('b',2),('c',3)],['x1','x2'])
df1.show()
+---+---+
| x1| x2|
+---+---+
| a| 1|
| b| 2|
| c| 3|
+---+---+
df2 = spark.createDataFrame([('a','T'),('b','F'),('d','T')],['x1','x3'])
df2.show()
+---+---+
| x1| x3|
+---+---+
| a| T|
| b| F|
| d| T|
+---+---+
## pyspark joins with join
## inner join by default
df1.join(df2,on='x1').show()
## Output: note that the joined data is not returned in the original row order
+---+---+---+
| x1| x2| x3|
+---+---+---+
| b| 2| F|
| a| 1| T|
+---+---+---+
## Note that if the two DataFrames share column names beyond the join key, the final result will contain duplicate columns
# In pandas the duplicates would be suffixed with _x, _y, etc., but Spark does not do this
# There are two ways to avoid a duplicated join key: pass the key by name with on=
df3 = df1.join(df2,on='x1')
# or, more directly, pass the join key(s) as a list
df1.join(df2,['x1'])
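If you join on a column expression instead of a name, both copies of the key survive; one workaround (a sketch, not from the original notes) is to drop one side's column after the join, and the same drop works for any other duplicated column:
df1.join(df2, df1.x1 == df2.x1).drop(df2.x1).show()   # keeps only df1's x1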
## Other common join types are left_outer and outer
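For instance, a left outer join keeps every row of df1 (a sketch on the frames above; the unmatched 'c' row gets x3 = null):
df1.join(df2, on='x1', how='left_outer').show()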
Besides join, there are row-wise combination operations:
df1 = spark.createDataFrame([('a',1),('b',2),('c',3)],['x1','x2'])
df2 = spark.createDataFrame([('a','T'),('b','F'),('d','T')],['x1','x2'])
df1.show();df2.show()
+---+---+
| x1| x2|
+---+---+
| a| 1|
| b| 2|
| c| 3|
+---+---+
+---+---+
| x1| x2|
+---+---+
| a| T|
| b| F|
| d| T|
+---+---+
# union: union of rows; intersect: intersection; subtract: set difference
print('union:')
df1.union(df2).orderBy('x1', ascending=True).show()
# Output
union:
+---+---+
| x1| x2|
+---+---+
| a| 1|
| a| T|
| b| 2|
| b| F|
| c| 3|
| d| T|
+---+---+
print('intersect:')
df1.intersect(df2).orderBy('x1', ascending=True).show()
# Output
intersect:
+---+---+
| x1| x2|
+---+---+
+---+---+
print('subtract:')
df1.union(df2).subtract(df1).orderBy('x1', ascending=True).show()
# Output
subtract:
+---+---+
| x1| x2|
+---+---+
| a| T|
| b| F|
| d| T|
+---+---+
# (1) python eval
'''eval() executes a string as a Python expression and returns its value; it lets you pre-build fairly complex Spark expressions as strings'''
# Find the name of the column holding each row's maximum value
# First use greatest to get the maximum itself
import pyspark.sql.functions as f
cond = "f.when" + ".when".join(["(f.col('" + c + "') == f.col('max_value'), f.lit('" + c + "'))" for c in df.columns])
df.withColumn("max_value", f.greatest(*df.columns))\
.withColumn("MAX", eval(cond))\
.show()
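The same chained expression can also be built without eval by folding over the columns with functools.reduce (a sketch, not from the original notes; Column.when chains just like the string version above):
from functools import reduce
import pyspark.sql.functions as f

cols = df.columns
max_expr = reduce(lambda acc, c: acc.when(f.col(c) == f.col('max_value'), f.lit(c)),
                  cols[1:],
                  f.when(f.col(cols[0]) == f.col('max_value'), f.lit(cols[0])))
df.withColumn('max_value', f.greatest(*cols))\
  .withColumn('MAX', max_expr)\
  .show()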
# Grouped computation 1
df.groupBy('step_id').count().show()
# Grouped computation 2: agg
unit_qty = df.groupBy(['step_id','equip_id']).agg({'unit_id':'count'})
# Grouped computation 3: applying several aggregate functions
import pyspark.sql.functions as F
unit_qty = df.groupBy(['step_id','equip_id']).agg(F.countDistinct('unit_id').alias('unit_qty'),
F.mean('label').alias('ratio'))
unit_qty.show()
+-------+--------+--------+-------------------+
|step_id|equip_id|unit_qty| ratio|
+-------+--------+--------+-------------------+
| 52000|C2PAS200| 1| 0.1917808219178082|
| 52000|C2PAS100| 1|0.05714285714285714|
| 51106|C2SRP100| 3|0.16039279869067102|
| 51000|C2PIL100| 1| 0.140084388185654|
| 53000|C2ODF100| 1|0.14112554112554113|
+-------+--------+--------+-------------------+
Beyond the built-in operations, pyspark's groupBy can be combined with pandas for more complex computations.
There are two styles. The first uses pandas_udf together with apply() or agg():
from pyspark.sql.functions import pandas_udf, PandasUDFType
df = spark.createDataFrame(
[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
("id", "v"))
## GROUPED_MAP&apply
@pandas_udf('id long,v double,c double',PandasUDFType.GROUPED_MAP)
def sum_udf(pdf):
    v = pdf.v
    return pdf.assign(c=v.sum())
df.groupby("id").apply(sum_udf).show()
## Output
+---+----+----+
| id| v| c|
+---+----+----+
| 1| 1.0| 3.0|
| 1| 2.0| 3.0|
| 2| 3.0|18.0|
| 2| 5.0|18.0|
| 2|10.0|18.0|
+---+----+----+
## Note: the schema string has to list every output column, which gets long; in Spark 3.x this GROUPED_MAP style is deprecated in favor of applyInPandas (shown further below)
## GROUPED_AGG&agg
@pandas_udf('double',PandasUDFType.GROUPED_AGG)
def mean_udf(v):
    return v.mean()
df.groupby("id").agg(mean_udf(df['v']).alias('c')).show()
## Output
+---+---+
| id| c|
+---+---+
| 1|1.5|
| 2|6.0|
+---+---+
## Window: like pandas transform, this computes a grouped value and attaches it as a new column
from pyspark.sql import Window
w = Window \
.partitionBy('id') \
.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn('mean_v', mean_udf(df['v']).over(w)).show()
## Output
# +---+----+------+
# | id| v|mean_v|
# +---+----+------+
# | 1| 1.0| 1.5|
# | 1| 2.0| 1.5|
# | 2| 3.0| 6.0|
# | 2| 5.0| 6.0|
# | 2|10.0| 6.0|
# +---+----+------+
# reference:https://www.cnblogs.com/wkang/p/10255043.html
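For a plain mean, the same result can be obtained without a pandas UDF by using the built-in aggregate over the window (a sketch; F is pyspark.sql.functions):
from pyspark.sql import functions as F
df.withColumn('mean_v', F.mean('v').over(w)).show()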
The other style defines an ordinary function and applies it with applyInPandas; personally I prefer this syntax:
import pandas as pd
from pyspark.sql.functions import pandas_udf
df = df.withColumn("label",df["label"].cast("double"))
df.printSchema()
def eqp_label(pdf):
    label = pdf.label
    step_id = pdf.step_id
    equip_id = pdf.equip_id
    return pdf.assign(step_id=step_id, step_ng=label.max(), equip_step=equip_id.nunique())
ng_all = df.select(
'step_id','equip_id','label'
).groupBy("step_id").applyInPandas(eqp_label,schema="step_id string,\
equip_id string,\
label double,\
step_ng double,\
equip_step double")
## With the syntax above, the schema cannot leave out the original data columns; in fact applyInPandas accepts any function that returns a DataFrame
def choose(pdf):
    equip = pdf.equip_id
    Nm = pdf.groupby(['step_id', equip]).apply(lambda x: x['glass_id'].nunique())
    Nm = pd.DataFrame(Nm).reset_index()
    Nm.columns = ['step_id', 'equip_id', 'nn']
    return Nm
dattmp.groupBy('step_id').applyInPandas(choose,schema="step_id string,\
equip_id string,\
nn double").show()
+-------+--------+------+
|step_id|equip_id| nn|
+-------+--------+------+
| 53000|C2ODF100|1155.0|
| 51106|C2SRP100| 611.0|
| 52000|C2PAS100| 455.0|
| 52000|C2PAS200| 730.0|
| 51000|C2PIL100|1185.0|
+-------+--------+------+
## Note: custom functions can also be nested inside, which enables more complex operations with custom arguments.
def datfilter(self, dat, leaf, label):  # no need to study this function in detail; it is pure pandas
    Nm = dat.groupby(leaf)[self.glass].count()
    Nm = Nm/Nm.sum()
    dat.loc[:, label] = 1*(dat[self.label] > self.lev)
    bNm = dat.groupby(leaf)[label].sum()
    bNm = bNm/bNm.sum()
    Nm = Nm[(Nm >= 0.1) | (bNm == 1)].reset_index()
    retdat = pd.Series([np.nan, np.nan], index=['value', 'IV'])
    if Nm.shape[0] > 1:
        datmp = pd.merge(dat, Nm[[leaf]], on=leaf, how='inner')
        retdat = self.chi2IV(datmp, leaf, label)
    return retdat
@staticmethod
def choose(self, pdf):
    step = pdf.step_id
    equip = pdf.equip_new
    label = pdf.label
    datfilter = self.datfilter
    result = pdf.groupby(step).apply(datfilter, equip, label)
    result = pd.DataFrame(result).reset_index()
    result.columns = ['step_id', 'value', 'IV']
    return result
stepdat = datuse.groupBy(self.step).applyInPandas(
    self.choose, schema="step_id long, p_value float, IV float")
**Note: when using a custom function inside a class, define it as a static method by adding the @staticmethod decorator before the function.**
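A minimal hypothetical skeleton of that pattern (class and column names are placeholders; note that a true @staticmethod receives only the pandas DataFrame, not self):
class StepAnalyzer:
    @staticmethod
    def per_group(pdf):                  # called by applyInPandas with one pandas DataFrame per group
        return pdf[['step_id']].drop_duplicates()

    def run(self, sdf):
        return sdf.groupBy('step_id').applyInPandas(
            StepAnalyzer.per_group, schema='step_id string')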
Pulling data down to the driver (local Python):
df_list = df.collect()
# each element is a Row; index by position or by column name to extract values
df_list = [row[0] for row in df_list]