Python 数据处理之 6.3

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


# Widen the console output so wide frames are not wrapped as early.
pd.set_option('display.width', 500)

# Load one worksheet; header=0 takes the first row as the column names.
data = pd.read_excel(
    r'C:\Users\Lenovo\Desktop\sales.xlsx',
    sheet_name='sheet1',
    header=0,
)

# head()/tail() show the first/last 5 rows unless a row count is passed,
# e.g. data.head(10).
print('data.head() = \n', data.head())
print('data.tail() = \n', data.tail())
# Data type of every column.
print('data.dtypes = \n', data.dtypes)
print(type(data))
# Column labels, first as an Index object, then one by one on a single line.
print('data.columns = \n', data.columns)
for column_name in data.columns:
    print(column_name, end=' ')

# New column 'total' = Jan + Feb + Mar, computed row by row.
monthly_columns = ['Jan', 'Feb', 'Mar']
data['total'] = data['Jan'] + data['Feb'] + data['Mar']
# Check that the new column is present.
print('data.head() = \n', data.head())

# Sum, minimum, maximum and mean of the 'Jan' column, in that order.
january = data['Jan']
for summarize in (january.sum, january.min, january.max, january.mean):
    print(summarize())

# Descriptive statistics (count, mean, std, min, quartiles, max): first for
# the whole frame, then restricted to the three monthly columns.
print(data.describe())
print(data[monthly_columns].describe())
# --- Append a totals row -----------------------------------------------------
# Step by step: column sums as a Series ...
s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
print(s1)
# ... wrapped in a one-column DataFrame ...
s2 = pd.DataFrame(data=s1)
print(s2)
# ... transposed so the four sums form a single row ...
print(s2.T)
# ... and expanded to every column of `data`; columns without a sum become NaN.
print(s2.T.reindex(columns=data.columns))

# The same construction in compact form, this time filling the columns that
# have no sum with 0 instead of NaN. The row-wise helpers later in this file
# test `row['state'] != 0` to skip this totals row.
s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
s = s.reindex(columns=data.columns, fill_value=0)
print(s)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index=True discards the existing row labels and
# renumbers the result 0..n-1.
data = pd.concat([data, s], ignore_index=True)
# Label the row that was just appended. Its position is derived from the frame
# length rather than hard-coded, so this works for any number of input rows.
data = data.rename(index={len(data) - 1: 'Total'})
print(data.tail())
# DataFrame.apply runs a function over the rows or the columns of a frame.
print('==============apply的使用==========')


def enum_row(row):
    """Print the ``state`` entry of *row*.

    Args:
        row: Any mapping-like object indexable by ``'state'``.
    """
    state_value = row['state']
    print(state_value)
# axis=1 calls enum_row once per row; the default, axis=0, would call it once
# per column instead.
data.apply(enum_row, axis=1)


# Lookup table from a region's full name to its two-letter abbreviation.
# Entry order is kept exactly as originally written.
state_to_code = {
    "VERMONT": "VT",
    "GEORGIA": "GA",
    "IOWA": "IA",
    "Armed Forces Pacific": "AP",
    "GUAM": "GU",
    "KANSAS": "KS",
    "FLORIDA": "FL",
    "AMERICAN SAMOA": "AS",
    "NORTH CAROLINA": "NC",
    "HAWAII": "HI",
    "NEW YORK": "NY",
    "CALIFORNIA": "CA",
    "ALABAMA": "AL",
    "IDAHO": "ID",
    "FEDERATED STATES OF MICRONESIA": "FM",
    "Armed Forces Americas": "AA",
    "DELAWARE": "DE",
    "ALASKA": "AK",
    "ILLINOIS": "IL",
    "Armed Forces Africa": "AE",
    "SOUTH DAKOTA": "SD",
    "CONNECTICUT": "CT",
    "MONTANA": "MT",
    "MASSACHUSETTS": "MA",
    "PUERTO RICO": "PR",
    "Armed Forces Canada": "AE",
    "NEW HAMPSHIRE": "NH",
    "MARYLAND": "MD",
    "NEW MEXICO": "NM",
    "MISSISSIPPI": "MS",
    "TENNESSEE": "TN",
    "PALAU": "PW",
    "COLORADO": "CO",
    "Armed Forces Middle East": "AE",
    "NEW JERSEY": "NJ",
    "UTAH": "UT",
    "MICHIGAN": "MI",
    "WEST VIRGINIA": "WV",
    "WASHINGTON": "WA",
    "MINNESOTA": "MN",
    "OREGON": "OR",
    "VIRGINIA": "VA",
    "VIRGIN ISLANDS": "VI",
    "MARSHALL ISLANDS": "MH",
    "WYOMING": "WY",
    "OHIO": "OH",
    "SOUTH CAROLINA": "SC",
    "INDIANA": "IN",
    "NEVADA": "NV",
    "LOUISIANA": "LA",
    "NORTHERN MARIANA ISLANDS": "MP",
    "NEBRASKA": "NE",
    "ARIZONA": "AZ",
    "WISCONSIN": "WI",
    "NORTH DAKOTA": "ND",
    "Armed Forces Europe": "AE",
    "PENNSYLVANIA": "PA",
    "OKLAHOMA": "OK",
    "KENTUCKY": "KY",
    "RHODE ISLAND": "RI",
    "DISTRICT OF COLUMBIA": "DC",
    "ARKANSAS": "AR",
    "MISSOURI": "MO",
    "TEXAS": "TX",
    "MAINE": "ME",
}
# Keys view over the full names; the fuzzy matchers below search these names.
states = state_to_code.keys()
# Similarity score of two strings that differ only by a space.
similarity = fuzz.ratio('Python Package', 'PythonPackage')
print(similarity)
# Several names from `states` that resemble the query.
close_matches = process.extract('Mississippi', states)
print(close_matches)
# Misspelled query; limit=1 restricts the result to the single best candidate.
top_match_list = process.extract('Mississipi', states, limit=1)
print(top_match_list)
# extractOne likewise returns just the best candidate.
top_match = process.extractOne('Mississipi', states)
print(top_match)


def find_state_code(row):
    """Print the best fuzzy match in ``states`` for the row's ``state`` value.

    Rows whose ``state`` equals 0 (the filler used for the totals row) are
    skipped. ``score_cutoff=80`` is forwarded to ``process.extractOne`` so that
    only candidates scoring at least that well are reported.

    Args:
        row: Mapping-like object indexable by ``'state'``.
    """
    raw_state = row['state']
    if raw_state == 0:
        return
    best_match = process.extractOne(raw_state, states, score_cutoff=80)
    print(best_match)
# axis=1: find_state_code is called once per row (not per column).
data.apply(find_state_code, axis=1)



def capital(str):
    """Return *str* with its first character capitalised and the rest lower-cased.

    NOTE: the parameter name shadows the built-in ``str``; it is kept as-is so
    existing callers are unaffected.
    """
    capitalised = str.capitalize()
    return capitalised
def correct_state(row):
    """Return the corrected spelling of the row's ``state`` value.

    The value is fuzzy-matched against ``states`` with ``score_cutoff=80``.
    When a match is found, each space-separated word of the matched name is
    passed through ``capital`` and the words are re-joined with single spaces.
    A value of 0 (the totals-row filler) or a value without a match is
    returned unchanged.

    Args:
        row: Mapping-like object indexable by ``'state'``.
    """
    original = row['state']
    if original == 0:
        return original

    match = process.extractOne(original, states, score_cutoff=80)
    if not match:
        return original

    matched_name = match[0]
    return ' '.join(capital(word) for word in matched_name.split(' '))
# Run correct_state on every row (axis=1) and overwrite the 'state' column
# with the results.
corrected_states = data.apply(correct_state, axis=1)
data['state'] = corrected_states
print('After Correct State:\n', data['state'])

'''加一列State Code(简写)'''
# The string above is a section marker: add a 'State Code' (abbreviation) column.


def fill_state_code(row):
    """Return the two-letter code for the row's ``state`` value.

    The value is fuzzy-matched against ``states`` with ``score_cutoff=80`` and
    the matched name is looked up in ``state_to_code``. An empty string is
    returned for a value of 0 (the totals-row filler) or when nothing matches.

    Args:
        row: Mapping-like object indexable by ``'state'``.
    """
    raw_state = row['state']
    if raw_state == 0:
        return ''

    match = process.extractOne(raw_state, states, score_cutoff=80)
    if not match:
        return ''

    return state_to_code[match[0]]
# Compute the code for every row (axis=1), then place the result as the sixth
# column (position 5) under the name 'State Code'.
state_codes = data.apply(fill_state_code, axis=1)
data.insert(5, 'State Code', state_codes)
print(data['State Code'])


print('==============group by================')
# Printing the GroupBy object only shows its repr; no aggregation happens yet.
print(data.groupby('State Code'))
# Per-state-code totals; rows sharing a code collapse into one line.
# numeric_only=True is passed explicitly because pandas 2.0 changed the default
# to False, which makes sum() also try to aggregate the text columns
# (concatenating strings or raising on mixed values) instead of skipping them.
print(data.groupby('State Code').sum(numeric_only=True))


# Write the processed frame to sheet 'Sheet1' of an Excel file, without the
# index column.
# NOTE(review): the original author marked this call as failing. The target
# name is 'Desktop.sales.xls' (a dot where the input path has a backslash), and
# writing the legacy .xls format may need a writer engine that is not
# installed -- TODO confirm the intended path and consider an .xlsx target.
data.to_excel(r'C:\Users\Lenovo\Desktop.sales.xls', sheet_name='Sheet1', index=False)

你可能感兴趣的:(python,python)