# encoding=utf-8
# Removing duplicate data
import pandas as pd
import numpy as np
# Small sample frame in which several (k1, k2) combinations repeat.
data = pd.DataFrame(
    {
        'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
# Optional inspection steps (left disabled):
#   data                     -> the raw frame
#   data.duplicated()        -> boolean Series flagging rows that repeat an earlier row
#   data.drop_duplicates()   -> the frame with those repeated rows removed

# Add a running counter column so that subset-based de-duplication can be tried.
data['v1'] = range(len(data))
# Further optional inspection steps (left disabled):
#   data['v1']                           -> the new counter column
#   data.drop_duplicates(['k1'])         -> duplicates judged on k1 only
#   data.drop_duplicates(['k1', 'k2'])   -> duplicates judged on k1 and k2 together
### Transforming data with a function or a mapping
# 1
# Sample purchases: a food name (with inconsistent capitalisation) and a weight.
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled port', 'bacon', 'pastrami', 'corned beef',
            'Bacon', 'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
# Lookup table from lower-case food name to the animal it comes from.
meat_to_animal = dict(
    [
        ('bacon', 'pig'),
        ('pulled port', 'pig'),
        ('pastrami', 'cow'),
        ('corned beef', 'cow'),
        ('honey ham', 'pig'),
        ('nova lox', 'salmon'),
    ]
)
# Lower-case the names first so that 'Bacon' matches the 'bacon' key,
# then translate each name through the lookup table.
data['animal'] = data['food'].str.lower().map(meat_to_animal)
# Equivalent single-step form (left disabled):
#   data['food'].map(lambda x: meat_to_animal[x.lower()])
### Data normalization
# Parameter setup: path of the workbook to load.
datafile = 'd:/data/normalization_data.xls'
# Read the data; header=None tells pandas not to use any row as column names.
data = pd.read_excel(datafile, header=None)
# print(<single string>) produces the same output on Python 2 and Python 3.
print('data:=\n' + str(data))
print('data.max():=\n' + str(data.max()))
# Min-max normalization (left disabled):
#   print((data - data.min()) / (data.max() - data.min()))
# Zero-mean normalization (left disabled):
#   print((data - data.mean()) / data.std())
# Decimal-scaling normalization: divide each column by the power of ten given
# by the ceiling of log10 of its largest absolute value. The result is
# printed because a bare expression in a script is evaluated and discarded.
print('data / 10 ** np.ceil(np.log10(data.abs().max())):=\n'
      + str(data / 10 ** np.ceil(np.log10(data.abs().max()))))
### Replacing values
# Sample series; -999 and -1000 are the values targeted by the disabled
# replace() examples below.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
# print(<single string>) produces the same output on Python 2 and Python 3.
print('data:=\n' + str(data))
# Replacement variants (left disabled):
#   data.replace(-999, np.nan)                   -> one value replaced by NaN
#   data.replace([-999, -1000], np.nan)          -> several values replaced by NaN
#   data.replace([-999, -1000], [np.nan, 0])     -> pairwise: -999 -> NaN, -1000 -> 0
### Renaming axis indexes
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four']
)
# Index.map applies the function to every label and returns the mapped labels.
# print(<single string>) produces the same output on Python 2 and Python 3.
print('data.index.map(str.upper):=\n' + str(data.index.map(str.upper)))
# Assigning the mapped labels back changes the frame's index in place.
data.index = data.index.map(str.upper)
print(data)
# rename() without inplace=True returns a transformed copy; print that copy,
# because calling it and dropping the result has no visible effect.
print(data.rename(index=str.title, columns=str.upper))
# The original frame is untouched by the call above.
print(data)
# A dict renames only the listed labels; again a copy is returned.
print('data.rename\n' + str(data.rename(
    index={'OHIO': 'INDIANA'},
    columns={'three': 'peekaboo'}
)))
# With inplace=True the frame itself is modified and rename() returns None,
# so there is no return value worth capturing.
data.rename(
    index={'OHIO': 'INDIANA'},
    inplace=True
)