http://www.aboutyun.com/thread-18150-1-1.html
PySpark简介
1
2
3
4
|
# Create a local SparkContext for this job, shipping the listed Python
# dependencies (a plain .py file, a zip archive and an egg) to the workers.
from pyspark import SparkContext

sc = SparkContext("local", "Job Name",
                  pyFiles=['MyFile.py', 'lib.zip', 'app.egg'])

# Load the system word list and take the first five words beginning "spar".
words = sc.textFile("/usr/share/dict/words")
words.filter(lambda w: w.startswith("spar")).take(5)
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
# Load the MovieLens user data (u.user) from HDFS.
user_data = sc.textFile("hdfs:/input/ml-100k/u.user")
# Inspect the first record to see the raw format.
user_data.first()
# Each line is "id|age|gender|occupation|zipcode" -- split on "|".
user_fields = user_data.map(lambda line: line.split("|"))
# Total number of users.
num_users = user_fields.map(lambda fields: fields[0]).count()
# Number of distinct genders (distinct() de-duplicates).
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
# Number of distinct occupations.
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
# Number of distinct ZIP codes.
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
# Print the summary statistics.
print("Users: %d, genders: %d, occupations: %d, ZIP codes: %d" % (num_users, num_genders, num_occupations, num_zipcodes))
# Collect every user's age to the driver for plotting.
ages = user_fields.map(lambda x: int(x[1])).collect()
# Histogram of user ages.
import matplotlib.pyplot as plt
# BUG FIX: the original called hist() with no plt. prefix, which is a
# NameError outside pylab mode.  (normed= is kept for the old matplotlib
# this tutorial targets; newer releases renamed it to density=.)
plt.hist(ages, bins=20, color='lightblue', normed=True)
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()
|
1
|
./bin/pyspark
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# Word-count style aggregation over the occupation column (field 3):
# emit (occupation, 1) pairs, then sum the counts per key.
count_by_occupation = (user_fields
                       .map(lambda fields: (fields[3], 1))
                       .reduceByKey(lambda x, y: x + y)
                       .collect())

import numpy as np

# Occupation names on the x axis, their counts on the y axis.
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])

# Display the categories in ascending order of count.
order = np.argsort(y_axis1)
x_axis = x_axis1[order]
y_axis = y_axis1[order]

# Bar positions and width for the chart.
pos = np.arange(len(x_axis))
width = 1.0

# Render the per-occupation counts as a bar chart.
from matplotlib import pyplot as plt

ax = plt.axes()
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)
plt.bar(pos, y_axis, width, color='lightblue')
plt.xticks(rotation=30)
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
#从HDFS中加载u.item数据
movie_data
=
sc.textFile(
"hdfs:/input/ml-100k/u.item"
)
#打印第一条数据,查看数据格式
print
movie_data.first()
#统计电影总数
num_movies
=
movie_data.count()
print
"Movies: %d"
%
num_movies
#定义函数功能为对电影数据预处理,对于错误的年限,使用1900填补
def
convert_year(x):
try
:
return
int
(x[
-
4
:])
except
:
return
1900
# there is a 'bad' data point with a blank year,which we set to 900 and will filter out later
#使用"|"分隔符分割每行数据
movie_fields
=
movie_data.
map
(
lambda
lines: lines.split(
"|"
))
#提取分割后电影发布年限信息,并做脏数据预处理
years
=
movie_fields.
map
(
lambda
fields: fields[
2
]).
map
(
lambda
x:convert_year(x))
#获取那些年限为1900的电影(部分为脏数据)
years_filtered
=
years.
filter
(
lambda
x: x !
=
1900
)
#计算出电影发布时间与1998年的年限差
movie_ages
=
years_filtered.
map
(
lambda
yr:
1998
-
yr).countByValue()
#将年限差作为x轴,电影数量作为y轴作柱状图
values
=
movie_ages.values()
bins
=
movie_ages.keys()
from
matplotlib
import
pyplot as plt1
plt1.hist(values, bins
=
bins, color
=
'lightblue'
, normed
=
True
)
fig
=
plt1.gcf()
fig.set_size_inches(
16
,
10
)
plt1.show()
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
# Load the ratings data (u.data) from HDFS.
rating_data = sc.textFile("hdfs:/input/ml-100k/u.data")
print(rating_data.first())
# Total number of rating records.
num_ratings = rating_data.count()
print("Ratings: %d" % num_ratings)
# Each line is "user\tmovie\trating\ttimestamp" -- split on tabs.
rating_data = rating_data.map(lambda line: line.split("\t"))
# Field 2 is the rating value (1-5).
ratings = rating_data.map(lambda fields: int(fields[2]))
# Maximum / minimum rating.
max_rating = ratings.reduce(lambda x, y: max(x, y))
min_rating = ratings.reduce(lambda x, y: min(x, y))
# BUG FIX: under Python 2, int / int truncates, so the mean and the two
# per-user / per-movie averages below were silently rounded down even
# though they are printed with %2.2f.  Force float division.
mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
# BUG FIX: numpy was originally imported only further down, after
# np.median had already been called -- import it before first use.
import numpy as np
# Median rating.
median_rating = np.median(ratings.collect())
# Average number of ratings per user / per movie (num_users and num_movies
# come from the earlier u.user / u.item sections).
ratings_per_user = num_ratings / float(num_users)
ratings_per_movie = num_ratings / float(num_movies)
# Print the summary statistics.
print("Min rating: %d" % min_rating)
print("Max rating: %d" % max_rating)
print("Average rating: %2.2f" % mean_rating)
print("Median rating: %d" % median_rating)
print("Average # of ratings per user: %2.2f" % ratings_per_user)
print("Average # of ratings per movie: %2.2f" % ratings_per_movie)
# Count how often each rating value occurs.
count_by_rating = ratings.countByValue()
# BUG FIX: dict iteration order is arbitrary; sort the rating values so
# the bars appear in 1..5 order with their counts kept aligned.
rating_values = sorted(count_by_rating.keys())
x_axis = np.array(rating_values)
# Normalise counts to probabilities (they sum to 1).
y_axis = np.array([float(count_by_rating[r]) for r in rating_values])
y_axis_normed = y_axis / y_axis.sum()
pos = np.arange(len(x_axis))
width = 1.0
# Bar chart of the rating distribution.
from matplotlib import pyplot as plt2
ax = plt2.axes()
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)
plt2.bar(pos, y_axis_normed, width, color='lightblue')
plt2.xticks(rotation=30)
fig = plt2.gcf()
fig.set_size_inches(16, 10)
plt2.show()
|
01
02
03
04
05
06
07
08
09
10
11
12
13
|
# Group the (user_id, rating) pairs by user.
user_ratings_grouped = rating_data.map(
    lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()
# (user_id, number of ratings that user gave).
# COMPAT FIX: tuple-unpacking lambda parameters (lambda (k, v): ...) are
# Python-2-only syntax, removed in Python 3 (PEP 3113).  Index the pair
# instead -- identical behavior under Python 2.
user_ratings_byuser = user_ratings_grouped.map(lambda kv: (kv[0], len(kv[1])))
# Show five sample results.
user_ratings_byuser.take(5)
# Histogram of ratings-per-user.
from matplotlib import pyplot as plt3
user_ratings_byuser_local = user_ratings_byuser.map(lambda kv: kv[1]).collect()
plt3.hist(user_ratings_byuser_local, bins=200, color='lightblue', normed=True)
fig = plt3.gcf()
fig.set_size_inches(16, 10)
plt3.show()
|