一、简介
本章涵盖导入和导出各种格式数据的基本知识,还包括清理数据的方式,比如值的归一化处理、缺失数据的添加、实时数据检查以及一些类似的技巧,以便正确地准备数据来进行可视化。
二、从CSV文件导入数据
以逗号分隔的值(文件中还包括一个文件头,也是以逗号分隔的)
#!/usr/bin/env python
# Read a comma-separated file whose first row is a header, then print the
# header followed by every data row.
import csv
import sys

filename = 'ch02-data.csv'

# Bind both results up front so the reporting code below is safe even when
# the file is empty (otherwise `header` would be unbound).
header = None
data = []

try:
    with open(filename) as f:
        reader = csv.reader(f)
        # The first row is the header; next() yields None for an empty file.
        header = next(reader, None)
        # Every remaining row is a data row.
        data.extend(reader)
except csv.Error as e:
    # Report the line the parser stopped at and exit with a failure status.
    # This path needs `sys`, which is why it is imported above.
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('==================')

for datarow in data:
    print(datarow)
如果处理大数据文件可以用numpy.loadtxt
import numpy

# Load every field of the comma-separated file as text.
# dtype=str is used instead of the 'string' alias.
# NOTE(review): the 'string' alias appears to be rejected by NumPy on
# Python 3 -- confirm against the installed version.
data = numpy.loadtxt('ch02-data.csv', dtype=str, delimiter=',')
三、从Microsoft Excel文件中导入数据
通常做法是把数据从Excel中导出到CSV格式的文件中
Python中可以使用xlrd模块(安装:pip install xlrd)
#!/usr/bin/env python
# Read every cell of the worksheet 'Sheet1' into a list of rows and
# pretty-print the result.
from pprint import pprint

import xlrd

# Named workbook_path rather than `file` so the builtin is not shadowed.
# NOTE(review): xlrd 2.x reads only the legacy .xls format -- confirm the
# installed xlrd version can open this .xlsx workbook.
workbook_path = 'ch02-xlsxdata.xlsx'

wb = xlrd.open_workbook(filename=workbook_path)
ws = wb.sheet_by_name('Sheet1')

# One inner list per worksheet row, holding the value of each cell in it.
dataset = [
    [ws.cell(r, c).value for c in range(ws.ncols)]
    for r in range(ws.nrows)
]

pprint(dataset)
四、从定宽数据文件导入数据
文件中的字段是有固定宽度的
可用Python中的struct模块
import struct
import string
# Parse a file of fixed-width records and print the fields of every line.
datafile = 'ch02-fixed-width-1M.data'

# Three fixed-width fields per record: 9, 14 and 5 bytes wide.
mask = '9s14s5s'
# Compile the format once instead of rebuilding a Struct for every line.
record = struct.Struct(mask)

# struct unpacks raw bytes, so the file is opened in binary mode.
with open(datafile, 'rb') as f:
    for line in f:
        fields = record.unpack_from(line)
        # strip() removes the leading and trailing whitespace of each field.
        values = [field.strip() for field in fields]
        print('fields:  ' + repr(values))
五、从制表符分隔的文件中导入数据
制表符分隔的文件可能导出自Excel文件,也可能是一些定制软件的输出
可以用CSV模块
# Read a tab-delimited file whose first row is a header, then print the
# header followed by every data row.
import csv
import sys

filename = 'ch02-data.tab'

# Bind both results up front so the reporting code below is safe even when
# the file is empty (otherwise `header` would be unbound).
header = None
data = []

try:
    with open(filename) as f:
        # The excel_tab dialect splits fields on tab characters.
        reader = csv.reader(f, dialect=csv.excel_tab)
        # The first row is the header; next() yields None for an empty file.
        header = next(reader, None)
        # Every remaining row is a data row.
        data.extend(reader)
except csv.Error as e:
    # Report the line the parser stopped at and exit with a failure status.
    # This path needs `sys`, which is why it is imported above.
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('===================')

for datarow in data:
    print(datarow)
六、从JSON数据源导入数据
JavaScript Object Notation(JSON)作为一种平台无关的格式被广泛地应用于系统间或者应用间的数据交换
使用requests模块获取数据
七、导出数据到JSON、CSV和Excel
写入数据
实例:
八、从数据库导入数据
使用SQL drivers
九、清理异常值
is_outlier
十、读取大块数据文件
十一、读取流数据源
import os
十二、导入图像数据到NumPy数组
接下来会介绍如何用NumPy和SciPy这两种Python库来做图像处理
import scipy.misc
import matplotlib.pyplot as plt
# Load a sample image as an array and display it with a colour bar.
# scipy.misc.lena() was removed from SciPy (0.17), so fall back to
# scipy.misc.ascent(), another greyscale test image, when lena is missing.
# NOTE(review): recent SciPy releases moved the sample images to
# scipy.datasets -- confirm against the installed version.
load_test_image = getattr(scipy.misc, 'lena', None) or scipy.misc.ascent
lena = load_test_image()

plt.gray()  # switch to the greyscale colormap before drawing
plt.imshow(lena)
plt.colorbar()
plt.show()
十三、生成可控的随机数据集合
可以用假数据来了解统计方法是不是能够得到我们想要的模型。因为已经预先知道了模型,所以我们可以把统计方法应用到已知数据上进行验证。在真实场景下,我们是没办法做到这一点的,因为我们必须要估计到,总会有一定程度的不确定性因素存在,可能导致错误的结果。
使用random模块
# Uniform distribution: histogram of SAMPLE_SIZE values from random.random()
import pylab
import random

SAMPLE_SIZE = 10000

# Seed the random generator before drawing.
random.seed()

# Collect the samples one draw at a time.
real_rand_vars = []
for _ in range(SAMPLE_SIZE):
    real_rand_vars.append(random.random())

# Histogram of the samples, split into 10 buckets.
pylab.hist(real_rand_vars, 10)
pylab.xlabel("Number range")
pylab.ylabel("Count")
pylab.show()
# Add Gaussian noise: plot a series whose every step is a normally
# distributed increment added to the previous value.
import pylab
import random

duration = 100       # number of time steps
mean_inc = 0.2       # mean of each increment
std_dev_inc = 1.2    # standard deviation of each increment

x = range(duration)
y = []

# Running total of the increments, starting from zero.
price_today = 0
for _ in x:
    price_today = price_today + random.normalvariate(mean_inc, std_dev_inc)
    y.append(price_today)

pylab.plot(x, y)
pylab.xlabel("Time")
pylab.ylabel("Value")
pylab.show()
如果想要更多的控制,可以使用不同的分布
# coding: utf-8
import random
import matplotlib
import matplotlib.pyplot as plt
# Draw SAMPLE_SIZE values from nine distributions of the random module and
# show one histogram per distribution on a 6x2 subplot grid.
SAMPLE_SIZE = 1000
# histogram buckets
buckets = 100

# Exponential rate: lambd is 1.0 divided by the desired mean.
lambd = 1.0 / ((SAMPLE_SIZE + 1) / 2.)

# Each entry pairs the x-axis label with a zero-argument sampler, in the
# order the histograms appear on the grid.
distributions = [
    # Next random float in the range [0.0, 1.0).
    ("random.random", random.random),
    # Random float N with a <= N <= b; the end point b may or may not be
    # included depending on rounding in a + (b - a) * random().
    ("random.uniform", lambda: random.uniform(1, SAMPLE_SIZE)),
    # Random float between low and high; the mode defaults to the midpoint,
    # giving a symmetric distribution.
    ("random.triangular", lambda: random.triangular(1, SAMPLE_SIZE)),
    # Beta distribution (alpha > 0, beta > 0); values lie between 0 and 1.
    ("random.betavariate", lambda: random.betavariate(1, 10)),
    # Exponential distribution; values range from 0 to positive infinity
    # for a positive lambd.
    ("random.expovariate", lambda: random.expovariate(lambd)),
    # Gamma distribution (not the gamma function), alpha > 0 and beta > 0:
    #   pdf(x) = x ** (alpha - 1) * exp(-x / beta)
    #            / (gamma(alpha) * beta ** alpha)
    ("random.gammavariate", lambda: random.gammavariate(1, 10)),
    # Log normal distribution: the natural logarithm of the values is
    # normally distributed with mean mu and standard deviation sigma.
    ("random.lognormvariate", lambda: random.lognormvariate(1, 0.5)),
    # Normal distribution with mean mu and standard deviation sigma.
    ("random.normalvariate", lambda: random.normalvariate(1, 0.5)),
    # Pareto distribution; alpha is the shape parameter.
    ("random.paretovariate", lambda: random.paretovariate(1)),
]

plt.figure()
# we need to update font size just for this example
matplotlib.rcParams.update({'font.size': 7})

for position, (label, draw) in enumerate(distributions, 1):
    plt.subplot(6, 2, position)
    plt.xlabel(label)
    # range(SAMPLE_SIZE) draws exactly SAMPLE_SIZE values; a range starting
    # at 1 would come up one sample short.
    plt.hist([draw() for _ in range(SAMPLE_SIZE)], buckets)

plt.tight_layout()
plt.show()
十四、真实数据的噪声平滑处理
清理真实数据源的数据,这些算法在信号处理领域很有名
基础算法是基于滚动窗口模式(例如卷积)
from pylab import *
from numpy import *
def moving_average(interval, window_size):
    """Smooth `interval` by convolving it with a flat window.

    The window holds int(window_size) equal weights, each 1 / window_size,
    and the convolution is computed in 'same' mode.
    """
    width = int(window_size)
    kernel = ones(width)
    # Scale in place so every weight equals 1 / window_size.
    kernel /= float(window_size)
    return convolve(interval, kernel, mode='same')
# Demo: a noisy sine wave and its moving average on the same axes.
# 100 evenly spaced sample points on [-4, 4].
t = linspace(-4, 4, 100)

# Sine signal plus normally distributed noise scaled by 0.1.
noise = randn(len(t)) * 0.1
y = sin(t) + noise
plot(t, y, "k.")  # raw samples as black dots

# Smooth the samples with a 10-point moving average, drawn in red.
y_av = moving_average(y, 10)
plot(t, y_av, "r")

xlabel("Time")
ylabel("Value")
grid(True)
show()