python 移除重复数据

# encoding=utf-8
#  移除重复数据
import pandas as pd
import numpy as np

# Sample frame whose (k1, k2) pairs repeat, used by the de-duplication demos.
data = pd.DataFrame(dict(
    k1=['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    k2=[1, 1, 2, 3, 3, 4, 4],
))
# Things to try on this frame:
#   data.duplicated()       -> boolean Series flagging rows that repeat an earlier row
#   data.drop_duplicates()  -> the frame with those repeated rows dropped

# Extra column holding 0..6, so no two complete rows are equal any more.
data['v1'] = range(7)
# De-duplicate on a subset of the columns instead of on whole rows:
#   data.drop_duplicates(['k1'])
#   data.drop_duplicates(['k1', 'k2'])

### Transforming data with a function or a mapping
# Example 1: label every food item with the animal it comes from.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled port', 'bacon',
        'pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Lookup table keyed by the lower-cased food name.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled port', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

# Lower-case the names first ('Bacon' would otherwise miss the table),
# then translate each name through the lookup table.
data['animal'] = data['food'].str.lower().map(meat_to_animal)
# A single-pass form that gives the same result for this data:
#   data['food'].map(lambda x: meat_to_animal[x.lower()])
### Data normalization
# Location of the sample workbook (hard-coded; adjust it for your machine).
datafile = 'd:/data/normalization_data.xls'
# Load the sheet; header=None treats the first row as data, not column names.
data = pd.read_excel(datafile, header=None)
# Single-argument print(...) calls produce the same text under Python 2 and 3,
# unlike the `print a, b` statement form, which is a SyntaxError on Python 3.
print('data:=\n{}'.format(data))
print('data.max():=\n{}'.format(data.max()))
# Min-max normalization:
#   (data - data.min()) / (data.max() - data.min())
# Zero-mean normalization:
#   (data - data.mean()) / data.std()
# Decimal-scaling normalization: divide each column by 10 raised to the
# ceiling of log10 of that column's largest absolute value. The result is
# bound to a name and printed; as a bare expression it was silently discarded.
decimal_scaled = data / 10 ** np.ceil(np.log10(data.abs().max()))
print('data / 10 ** np.ceil(np.log10(data.abs().max())):=\n{}'.format(decimal_scaled))

### Replacing values
data = pd.Series([1., -999., 2., -999., -1000., 3.])
# A single-argument print(...) call produces the same text under Python 2
# and 3; the `print a, b` statement form is a SyntaxError on Python 3.
print('data:=\n{}'.format(data))
# Replacement variants to try:
#   data.replace(-999, np.nan)                 # one value -> NaN
#   data.replace([-999, -1000], np.nan)        # several values -> NaN
#   data.replace([-999, -1000], [np.nan, 0])   # pairwise: -999 -> NaN, -1000 -> 0

### Renaming axis indexes
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Single-argument print(...) calls produce the same text under Python 2 and 3;
# the `print a, b` statement form is a SyntaxError on Python 3.
print('data.index.map(str.upper):=\n{}'.format(data.index.map(str.upper)))
# Assigning to .index relabels the rows of the existing frame.
data.index = data.index.map(str.upper)
print(data)
# rename() without inplace=True returns a new frame and leaves `data` as it
# was, so the renamed copy is printed directly instead of being discarded.
print(data.rename(index=str.title, columns=str.upper))
# Dict arguments rename only the listed labels; again a new frame is returned.
print('data.rename\n{}'.format(data.rename(
    index={'OHIO': 'INDIANA'},
    columns={'three': 'peekaboo'},
)))
# inplace=True modifies `data` itself. Current pandas documents the return
# value as None in that case, so nothing is assigned from the call.
data.rename(
    index={'OHIO': 'INDIANA'},
    inplace=True
)




你可能感兴趣的:(python 移除重复数据)