# 数据分析基础(一) -- Data Analysis Basics (Part 1)

# coding: utf-8

# In[1]:


import pandas as pd


# In[4]:


# Read the Pokemon CSV from the working directory into a DataFrame.
data = pd.read_csv("pokemon.csv")


# In[6]:


# Preview the first five rows (only rendered in an interactive session).
data.head(5)


# In[12]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook.
# ``run_line_magic`` replaces the deprecated ``InteractiveShell.magic`` call.
get_ipython().run_line_magic('matplotlib', 'inline')

# List the files in the current working directory (where pokemon.csv is
# expected to live). NOTE: relies on the external ``ls`` command being
# available on the host.
from subprocess import check_output
print(check_output(["ls", "./"]).decode("utf8"))


# In[13]:


# Correlation heatmap of the numeric features.
# Text columns cannot be correlated, and pandas >= 2.0 raises on them instead
# of silently skipping them, so restrict the frame to numeric/bool columns.
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    data.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    linewidths=.5,
    fmt='.1f',
    ax=ax,
)


# In[14]:


# First ten rows of the dataset.
data.head(10)


# In[15]:


# Per-column dtype and non-null summary.
data.info()


# In[16]:


# Column labels.
data.columns


# In[19]:


# Small dict walkthrough: country -> city.
dictionary = dict(spain='madrid', usa='vegas')
print(dictionary.keys())
print(dictionary.values())


# In[20]:


# Overwrite the value stored under an existing key.
dictionary.update(spain="barcelona")
print(dictionary)
# Insert a brand-new key.
dictionary.update(france="paris")
print(dictionary)
# Remove the 'spain' entry (raises KeyError if it were missing).
dictionary.pop('spain')
print(dictionary)
# Membership test on the keys.
print('france' in dictionary)
# Empty the dict in place.
dictionary.clear()
print(dictionary)


# In[23]:


# Boolean mask selecting the rows whose Defense is above 200.
x = data["Defense"] > 200
data[x]


# In[27]:


# Element-wise AND of two conditions; each comparison needs its own
# parentheses because ``&`` binds tighter than ``>``.
x = (data["Attack"] > 100) & (data["Defense"] > 200)


# In[28]:


# Rows satisfying both conditions.
data[x]


# In[29]:


# (number of rows, number of columns)
data.shape


# In[35]:


# Frequency of every secondary type, with missing values counted as their
# own NaN bucket.
print(data["Type 2"].value_counts(dropna=False))


# In[36]:


# Same frequencies with the missing values left out.
print(data["Type 2"].value_counts(dropna=True))


# In[37]:


# Summary statistics for the columns that describe() covers by default.
data.describe()


# In[38]:


# Example: compare the Attack distribution of legendary and non-legendary
# pokemons with one box per Legendary value.
# Reading a box plot: the box spans the 25% and 75% quartiles, the line inside
# the box is the median (50%), and the whiskers reach towards the min and max.
# NOTE(review): the original notes claimed there are no outliers; confirm
# against the rendered figure.
data.boxplot(column="Attack", by="Legendary")


# In[39]:


# Keep only the first five rows so the reshaping steps are easy to follow.
data_new = data.head()


# In[164]:


data_new


# In[172]:


# Select two columns by passing a list of labels.
data_new[["Attack", "Defense"]]


# In[171]:


# Replace the row labels with custom ones (one label per row).
data_new.index = ["A", "b", "c", "e", "f"]
data_new


# In[41]:


# Melt: go from wide to long format.
#   id_vars    -> column(s) kept as identifiers (not melted)
#   value_vars -> column(s) unpivoted into variable/value pairs
melt = pd.melt(
    data_new,
    id_vars="Name",
    value_vars=["Attack", "Defense"],
)


# In[42]:


melt


# In[46]:


# Each melted column contributes one row per pokemon.
melt["variable"].value_counts()


# In[48]:


# Pivot is the reverse of melt:
#   rows    <- Name
#   columns <- the distinct entries of "variable"
#   cells   <- "value"
melt.pivot(index="Name", columns="variable", values="value")


# In[56]:


# Build two small frames: the first and the last five rows.
data1 = data.head()
data2 = data.tail()
# axis=0 stacks the frames vertically (row-wise); ignore_index renumbers rows.
conc_data_row = pd.concat((data1, data2), axis=0, ignore_index=True)
conc_data_row


# In[59]:


# Two Series taken from the first five rows.
data1 = data["Attack"].head()
data2 = data["Defense"].head()


# In[60]:


data1


# In[61]:


data2


# In[62]:


# axis=1 places the Series side by side (column-wise), aligned on the index.
conc_data_col = pd.concat((data1, data2), axis=1)
conc_data_col


# In[96]:


# Two 2x3 integer arrays, joined column-wise at the end of this section.
a = np.array([[1, 2, 3],
              [3, 2, 1]])
b = np.array([[4, 5, 6],
              [6, 5, 4]])


# In[66]:


# 3x3 float array holding 0.0 .. 8.0.
x = np.arange(9.).reshape((3, 3))


# In[69]:


# With a single argument, np.where returns the indices where the condition
# holds, as a tuple with one index array per dimension.
y = np.where(x > 5)


# In[70]:


# Index with that tuple to pull out the matching elements.
x[y]


# In[72]:


y


# In[73]:


# With three arguments, np.where chooses element-wise:
# 2 where x is positive, -2 everywhere else.
x = np.random.randn(4, 4)
print(np.where(x > 0, 2, -2))


# In[74]:


x


# In[81]:


xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
zarr = np.array([True, False, True, True, False])
# Pure-Python version: take the xarr element when the flag is set, else the
# yarr element.
result = [
    x_item if flag else y_item
    for x_item, y_item, flag in zip(xarr, yarr, zarr)
]


# In[82]:


# Vectorised equivalent of the comprehension above.
result = np.where(zarr, xarr, yarr)
print(result)


# In[99]:


# Join a and b along axis 1: matching rows are placed side by side.
np.concatenate((a, b), axis=1)


# In[100]:


data.dtypes


# In[101]:


# Convert the object (str) column "Type 1" to categorical and the integer
# column "Speed" to float.
data['Type 1'] = data['Type 1'].astype('category')
data['Speed'] = data['Speed'].astype('float')


# In[102]:


# Confirm the new dtypes.
data.dtypes


# In[103]:


# Drop the rows whose "Type 2" is NaN.
# Calling dropna(inplace=True) on data1["Type 2"] only touches the temporary
# Series returned by the column lookup and never changes the frame, and
# ``data1 = data`` would merely alias the original. Build a filtered frame
# instead, so ``data`` stays untouched for the later cells.
data1 = data.dropna(subset=["Type 2"])


# In[106]:


data1["Type 2"].value_counts()


# In[113]:


# Evaluates to True: no missing "Type 2" values remain in data1.
data1["Type 2"].notnull().all()


# In[128]:


# Build a DataFrame from a dictionary of column label -> list of values.
country = ["Spain", "France"]
population = ["11", "12"]
list_label = ["country", "population"]
list_col = [country, population]
# Pair every label with its column data ...
zipped = [(label, values) for label, values in zip(list_label, list_col)]
# ... and turn the (label, values) pairs into a dict.
data_dict = {label: values for label, values in zipped}
df = pd.DataFrame(data_dict)
df


# In[117]:


zipped


# In[118]:


data_dict


# In[129]:


# dict() also accepts any iterable of (key, value) pairs.
a = ((1, 2), (3, 4))
a = {key: value for key, value in a}


# In[130]:


a


# In[131]:


# Add a new column from a list with one entry per row.
df["capital"] = ["madrid", "paris"]
df


# In[132]:


# Broadcasting: a scalar is repeated for every row of the new column.
df["income"] = 0
df


# In[137]:


# Line plot of three stats, all drawn on the same axes.
data1 = data[["Attack", "Defense", "Speed"]]
data1.plot()


# In[192]:


data1.head()


# In[144]:


# Positional (iloc) versus label-based (loc) row selection; the list argument
# keeps the result a DataFrame.
data1.iloc[[0]]
data1.loc[[0]]


# In[155]:


# Positional selection: the first two rows, columns at positions 1 and 2.
data.iloc[:2, [1, 2]]


# In[158]:


# One subplot per column.
data1.plot(subplots=True)


# In[161]:


# Scatter plot of Attack against Defense.
data1.plot(kind="scatter", x="Attack", y="Defense")


# In[162]:


# Histogram of Defense, normalised so that the bar areas sum to 1.
# ``density=True`` replaces the ``normed`` keyword, which matplotlib removed.
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True)


# In[163]:


# Two stacked histograms: non-cumulative on top, cumulative below.
fig, axes = plt.subplots(nrows=2, ncols=1)
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True, ax=axes[0])
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True, ax=axes[1], cumulative=True)


# In[173]:


time_list = ["1992-03-08", "1992-04-12"]
# The dates start out as plain strings ...
print(type(time_list[-1]))
# ... and pd.to_datetime turns the whole list into datetime values.
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))


# In[174]:


# To practise time-series indexing, take the first five rows of the Pokemon
# data. ``copy()`` gives an independent frame, so adding the "date" column
# below does not write into a slice of ``data`` (SettingWithCopyWarning).
data2 = data.head().copy()


# In[175]:


data2


# In[185]:


# Attach one date per row and make that column the index.
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)
data2["date"] = datetime_object
# lets make date as index
data2 = data2.set_index("date")
data2


# In[183]:


# Alternative: assign the DatetimeIndex directly instead of going through a
# temporary "date" column.
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)
data2.index = datetime_object
data2.index.name = "date"


# In[184]:


data2


# In[196]:


# Select by a single date string, then by an inclusive date range.
print(data2.loc["1993-03-16"])
print(data2.loc["1992-03-10":"1993-03-16"])


# In[200]:


data2


# In[199]:


# resample() needs a frequency string: "A" = year, "M" = month.
# NOTE(review): pandas 2.2 deprecates these aliases in favour of "YE"/"ME";
# switch once the minimum supported pandas version allows it.
# Only numeric/bool columns can be averaged, and pandas >= 2.0 raises on
# text/categorical columns instead of dropping them, so select them first.
data2_numeric = data2.select_dtypes(include=["number", "bool"])
data2_numeric.resample('A').mean()


# In[201]:


# Resample by month. Months without any observation show up as NaN rows
# because data2 does not cover every month.
data2_numeric.resample("M").mean()


# In[204]:


# Take the first observation of each month, then fill the gaps linearly.
# NOTE(review): the non-numeric columns are still part of this frame; recent
# pandas may warn about or reject interpolating object columns -- confirm.
data2.resample("M").first().interpolate('linear')


# In[210]:


# Reload the dataset and use the "#" column as the row index.
data = pd.read_csv("pokemon.csv").set_index("#")
data.head()


# In[211]:


# Square brackets: column first, then row label.
data["HP"][1]


# In[212]:


# Column as an attribute, then row label.
data.HP[1]


# In[213]:


# loc accessor: row label first, then a list of column labels.
data.loc[1, ["HP"]]


# In[215]:


# Pick a subset of columns with a list of labels.
data[["HP", "Attack"]].head()


# In[216]:


# Label slices include both end points: row 10 and "Defense" are part of the
# result.
data.loc[1:10, "HP":"Defense"]


# In[218]:


# Same selection with the rows walked in reverse.
data.loc[10:1:-1, "HP":"Defense"]


# In[219]:


# Open-ended column slice: from "Speed" through the last column.
data.loc[1:10, "Speed":]


# In[220]:


# Plain python functions
def div(n):
    """Return ``n`` divided by two (true division)."""
    half = n / 2
    return half
# Apply the named function to every HP value.
data["HP"].apply(div)


# In[222]:


# The same transformation written as an inline lambda.
data["HP"].apply(lambda hp: hp / 2)


# In[223]:


# Define a new column as the element-wise sum of two existing columns.
data["total_power"] = data.Attack + data.Defense
data.head()


# In[225]:


# Show the current index name ...
print(data.index.name)
# ... then rename it.
data.index.name = "index_name"
data.head()


# In[226]:


# Overwriting the index means supplying one new label for every row.
data.head()
# Work on a copy so that ``data`` keeps its original index.
data3 = data.copy()
# Start the labels at 100 (not a meaningful change, just an example). The stop
# value is derived from the row count instead of being hard-coded to 900,
# which only matches a frame of exactly 800 rows and raises a length-mismatch
# error otherwise.
data3.index = range(100, 100 + len(data3))
data3.head()


# In[228]:


# Reload the dataset with its default integer index.
data = pd.read_csv("pokemon.csv")


# In[231]:


# Hierarchical index: "Type 1" is the outer level, "Type 2" the inner level.
data1 = data.set_index(keys=["Type 1", "Type 2"])
data1.head(100)
# Example of selecting with both index levels:
# data1.loc["Fire","Flying"]


# In[235]:


# Small example frame: two treatments, two genders, two numeric measurements.
dic = {"treatment":["A","A","B","B"],"gender":["F","M","F","M"],"response":[10,45,5,9],"age":[15,4,72,65]}
df = pd.DataFrame(dic)
df


# In[236]:


# Pivot: one row per treatment, one column per gender, cells hold age.
df.pivot(index='treatment', columns='gender', values='age')


# In[237]:


# Two-level index: treatment is the outer level, gender the inner level.
df1 = df.set_index(["treatment","gender"])
df1


# In[239]:


# Unstack moves one index level into the columns; level 0 is treatment.
df1.unstack(level=0)


# In[240]:


# Level 1 is gender.
df1.unstack(level=1)


# In[241]:


# Swap the inner and outer index levels.
df2 = df1.swaplevel(0, 1)
df2


# In[242]:


df


# In[243]:


# Reverse of pivoting: melt age and response into variable/value pairs.
# df.pivot(index="treatment",columns = "gender",values="response")
pd.melt(df, id_vars="treatment", value_vars=["age","response"])


# In[244]:


df


# In[247]:


# Mean of the other features per treatment. mean is an aggregation/reduction
# method; others include sum, std, max and min.
# numeric_only=True skips the text column "gender", which pandas >= 2.0 would
# otherwise try to average and fail with a TypeError.
df.groupby("treatment").mean(numeric_only=True)


# In[248]:


# Aggregate a single feature.
df.groupby("treatment").age.mean()


# In[249]:


# Aggregate several selected features.
df.groupby("treatment")[["age","response"]].mean()


# In[250]:


# Before the conversion, info() reports gender and treatment as object columns.
df.info()
# Grouping keys are good candidates for the categorical dtype: it uses less
# memory and speeds up operations such as groupby.
df["gender"] = df["gender"].astype("category")
df["treatment"] = df["treatment"].astype("category")
df.info()


# In[ ]:





# 你可能感兴趣的:(python数据分析) -- blog footer: "You may also be interested in: Python data analysis"