最近准备对推荐系统进行优化,在学习spark的mllib的代码的时候,发现里面很多的参数都是使用numpy。于是有了这篇博客,详细解释了python的机器学习的三剑客,我后面也是想把数据用图表的形式展示出来,这样就比较形象了。
NumPy 是 Python 语言的一个扩展程序库,支持大量的高维数组与矩阵运算,此外也针对数组运算提供了大量的数学函数库。NumPy 的底层运算由 C 实现,许多运算在执行时会释放 Python 的 GIL,运算效率极好,是大量机器学习框架的基础库。
import numpy as np
# Build a NumPy array from a plain Python list and inspect its attributes.
a = [1, 2, 3, 4]
# Convert the list `a`; the resulting array is bound to `b`.
b = np.array(a)
# The bare attribute look-ups below only display a value in an
# interactive session; in a script they have no visible effect.
# Number of elements in the array
b.size
# Shape of the array
b.shape
# Number of dimensions of the array
b.ndim
# Element type of the array
b.dtype
# Create a 10x10 matrix filled with floating-point 1
array_one = np.ones([10, 10])
# Create a 10x10 matrix filled with floating-point 0
array_zero = np.zeros([10, 10])
# Draw a 2x3 sample from a normal distribution with mean 1.75 and
# standard deviation 0.1; the value is not stored.
np.random.normal(1.75, 0.1, (2, 3))
# Draw a 4x5 sample from the same distribution and keep it.
arr = np.random.normal(1.75, 0.1, (4, 5))
print(arr)
# Slice rows 1..2 and columns 2..3 (the end index is exclusive).
after_arr = arr[1:3, 2:4]
print(after_arr)
# A flat array of 20 ones ...
one_20 = np.ones([20])
print(one_20)
# ... reshaped into 4 rows by 5 columns.
one_4_5 = one_20.reshape([4, 5])
print(one_4_5)
# Score table with 5 rows and 2 columns.
stus_score = np.array([[80, 88], [82, 81], [84, 75], [86, 83], [75, 81]])
# Element-wise comparison producing a boolean array (only displayed in an
# interactive session).
stus_score > 80
# Element-wise choice: 0 where the score is below 80, otherwise 90.
np.where(stus_score < 80, 0, 90)

# Maximum along axis 0 (one value per column), then axis 1 (one per row).
result = np.amax(stus_score, axis=0)
print(result)
result = np.amax(stus_score, axis=1)
print(result)

# Minimum along each axis.
result = np.amin(stus_score, axis=0)
print(result)
result = np.amin(stus_score, axis=1)
print(result)

# Mean along each axis.
result = np.mean(stus_score, axis=0)
print(result)
result = np.mean(stus_score, axis=1)
print(result)

# Standard deviation along each axis.
result = np.std(stus_score, axis=0)
print(result)
result = np.std(stus_score, axis=1)
print(result)

# In-place arithmetic on the first column: add 5, then multiply by 5.
stus_score[:, 0] = stus_score[:, 0] + 5
print(stus_score)
stus_score[:, 0] = stus_score[:, 0] * 5
print(stus_score)

# Matrix product shapes: (M rows, N cols) x (N rows, Z cols) = (M rows, Z cols).
# NOTE: column 0 of stus_score has already been shifted and scaled above,
# so the weighted sum below is computed on the modified table.
q = np.array([[0.4], [0.6]])
result = np.dot(stus_score, q)
print(result)
# Two 2x6 nested lists used as input for both stacking examples.
v1 = [[0, 1, 2, 3, 4, 5],
      [6, 7, 8, 9, 10, 11]]
v2 = [[12, 13, 14, 15, 16, 17],
      [18, 19, 20, 21, 22, 23]]
# Vertical stack: rows of v2 are appended below the rows of v1 (4x6).
result = np.vstack((v1, v2))
print(result)
# Horizontal stack: columns of v2 are appended to the right of v1 (2x12).
result = np.hstack((v1, v2))
print(result)
# Load the CSV file into an array, splitting fields on commas.
# The NumPy loader is named genfromtxt; plain ASCII quotes are required
# for string literals.
result = np.genfromtxt("./students_score.csv", delimiter=",")
pandas是基于numpy开发出的,专门用于数据分析的开源python库
import numpy as np
import pandas as pd
# Series built from a NumPy range; no index is given, so pandas assigns
# the default integer index.
print(pd.Series(np.arange(4, 10)))
# Series with explicit index labels.
pd.Series([11, 12, 14], index=["北京", "上海", "深圳"])
# Series built from a dict: the keys become the index labels.
pd.Series({"北京": 11, "上海": 12, "深圳": 14})

# Create a DataFrame with 3 rows and 4 columns.
data_3_4 = pd.DataFrame(np.arange(10, 22).reshape(3, 4))
# Print the whole frame.
print(data_3_4)
# Print the first row (row slice).
print(data_3_4[:1])
# Print the first column (column label 0).
print(data_3_4[0])
# Load the CSV file into a DataFrame.
result = pd.read_csv("./students_score.csv")

# The bare attribute look-ups below only display a value in an
# interactive session.
result.shape    # shape of the data
result.dtypes   # type of each column
result.ndim     # number of dimensions
result.index    # row index (a start/stop/step range for the default index)
result.columns  # name of each column
result.values   # the data as an array

# Print the first five rows, the last five rows and the summary
# statistics, each preceded by its heading. The views are built lazily so
# that each one is computed right before it is printed.
for heading, make_view in (
    ("-->前5个:", lambda: result.head(5)),
    ("-->后5个:", lambda: result.tail(5)),
    ("-->描述信息:", lambda: result.describe()),
):
    print(heading)
    print(make_view())
pandas.read_csv(filepath,sep=",",names=None,usecols=None)
返回的类型:DataFrame
# Entries 0..5 of the '姓名' column (only displayed in an interactive session).
result['姓名'][0:6]
# Rows whose 'age' value is greater than 23 (boolean-mask selection).
result[result['age'] > 23]
IMDB_1000 = pd.read_csv("./IMDB-Movie-Data.csv")
# Show the column names together with their dtypes.
print(IMDB_1000.dtypes)
# Sort the movies by rating in descending order; `ascending` defaults to
# True (ascending), so False is passed here. The sorted frame is returned,
# IMDB_1000 itself is not reordered.
IMDB_1000.sort_values(by="Rating", ascending=False)
# Movie(s) with the longest runtime.
IMDB_1000[IMDB_1000["Runtime (Minutes)"] == IMDB_1000["Runtime (Minutes)"].max()]
# Movie(s) with the shortest runtime.
IMDB_1000[IMDB_1000["Runtime (Minutes)"] == IMDB_1000["Runtime (Minutes)"].min()]
# Mean runtime.
IMDB_1000["Runtime (Minutes)"].mean()
# Rows containing missing values are dropped in the returned frame only;
# IMDB_1000 keeps all rows because the result is not assigned.
IMDB_1000.dropna()
# Fill missing revenue values with the column mean. The filled column is
# assigned back explicitly: calling fillna(inplace=True) on a column
# selection is a chained assignment, which recent pandas versions warn
# about and which does not update the frame under copy-on-write.
IMDB_1000["Revenue (Millions)"] = IMDB_1000["Revenue (Millions)"].fillna(
    IMDB_1000["Revenue (Millions)"].mean()
)
# Read the data set directly from its URL and name the columns explicitly
# (the names follow the data set's description document).
bcw = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=[
        "Sample code number",
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
        "Class:",
    ],
)
# Preprocessing: replace every "?" placeholder in the data with np.nan.
bcw = bcw.replace(to_replace="?", value=np.nan)
# Read only the first 10 rows of the file.
train = pd.read_csv("./train.csv", nrows=10)
# Convert the "time" column to datetimes; unit="s" tells pandas that
# numeric values are expressed in seconds.
train["time"] = pd.to_datetime(train["time"], unit="s")
# Add the columns year, month and weekday, each taken from the matching
# attribute of a DatetimeIndex built over the converted column.
time_index = pd.DatetimeIndex(train["time"])
for part in ("year", "month", "weekday"):
    train[part] = getattr(time_index, part)
# Read the three tables.
user_info = pd.read_csv("./user_info.csv")
order_info = pd.read_csv("./order_info.csv")
goods_info = pd.read_csv("./goods_info.csv")
# Merge the three tables with left joins: orders onto users by "user_id",
# then goods onto that result by "goods_name". Each join key is named
# once; repeating the same column in `on` adds nothing.
u_o = pd.merge(user_info, order_info, how="left", on="user_id")
u_o_g = pd.merge(u_o, goods_info, how="left", on="goods_name")
# Cross-tabulation relating the "姓名" (user name) column to the goods names.
user_goods = pd.crosstab(u_o_g["姓名"], u_o_g["goods_name"])
starbucks = pd.read_csv("./directory.csv")
# Group by country and count the non-null entries of every remaining column.
per_country = starbucks.groupby(["Country"])
per_country.count()
# Same count, grouped by country and by state/province within it.
per_country_and_state = starbucks.groupby(["Country", "State/Province"])
per_country_and_state.count()
matplotlib 是python 2D绘图领域的基础套件,它让使用者将数据图形化,并提供多样化的输出格式。这里将会以四个小案例探索matplotlib的常见用法
import matplotlib.pyplot as plt
import random
# plt.plot([1, 2, 3, 4])
# plt.ylabel("some numbers")
# plt.show()
#
# plt.plot([1, 2, 3, 4], [1, 4, 9, 16], 'ro')
# plt.show()
# Line chart of random hourly temperatures for three cities.
hours = range(0, 24)

# Beijing: random integers in [10, 30], default line style.
beijing_x = list(hours)
beijing_y = [random.randint(10, 30) for _hour in hours]
plt.plot(beijing_x, beijing_y, label="beijing")

# Shanghai: random integers in [10, 20], default line style.
shanghai_x = list(hours)
shanghai_y = [random.randint(10, 20) for _hour in hours]
plt.plot(shanghai_x, shanghai_y, label="shanghai")

# Hefei: random integers in [30, 40], drawn as a thick, dotted,
# semi-transparent purple line.
hefei_x = list(hours)
hefei_y = [random.randint(30, 40) for _hour in hours]
plt.plot(
    hefei_x,
    hefei_y,
    label="hefei",
    color="#823384",
    linestyle=":",
    linewidth=3,
    alpha=0.3,
)

# Axis ticks: one x tick per hour labelled "<n>h", and a y tick at every
# second value below 50 labelled "<n>c".
x_ = list(range(24))
x_desc = ["{}h".format(hour) for hour in x_]
plt.xticks(x_, x_desc)
y_ = list(range(0, 50, 2))
y_desc = ["{}c".format(degree) for degree in y_]
plt.yticks(y_, y_desc)

# Axis labels, title and legend, then display the figure.
plt.xlabel("time")
plt.ylabel("temperature")
plt.title("the temperature change in one day")
plt.legend(loc="best")
plt.show()
import matplotlib.pyplot as plt
import random
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # set the default font
mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from showing as a box in saved figures
# Bar chart of the ages of the main Detective Conan characters.
role_list = ["michael", "sdsds", "sdasd", "ffff", "gggg", "bbb", "nnn", "lll"]
role_age = [7, 17, 7, 34, 32, 30, 27, 46]
# Actual ages
role_ture_age = [18, 17, 18, 34, 45, 30, 27, 46]
# One x position per role, starting at 1.
x = list(range(1, len(role_list) + 1))
y = role_age
y2 = role_ture_age
plt.figure(figsize=(15, 8), dpi=100)
# Draw the two series side by side around each tick. With align="edge" a
# negative width puts the bar to the left of x and a positive width puts
# it to the right. With the default centre alignment both bars sit on the
# same x and the second series is painted over the first.
plt.bar(x, y, width=-0.4, align="edge", label="role age", color="#509839")
plt.bar(x, y2, width=0.3, align="edge", label="role real age", color="#c03035")
# Label each x tick with the role name.
x_desc = list(role_list)
plt.xticks(x, x_desc)
# y ticks every 5 units below 50.
y = range(0, 50, 5)
plt.yticks(y)
plt.xlabel("role")
plt.ylabel("age")
plt.title("the role in cartoon Detective conan")
plt.legend(loc="best")
plt.show()