《使用pandas进行数据分析》网课笔记(19到23)

十九. How do I select multiple rows and columns in a DataFrame?


# coding: utf-8

# In[1]:


#19  Selecting rows and columns with .loc (labels) and .iloc (integer positions)
import pandas as pd


# In[2]:


# No index_col is given, so the rows get the default integer index 0, 1, 2, ...
ufo = pd.read_csv('http://bit.ly/uforeports')


# In[3]:


ufo.head(3)


# In[4]:


# .loc[row_labels, column_labels]; a bare ':' means "everything" on that axis.
# This is the single row whose index label is 0, with all columns.
ufo.loc[0,:]


# In[7]:


# The following two expressions are equivalent:
# a .loc label slice includes BOTH endpoints, so 0:2 selects labels 0, 1 and 2.
#ufo.loc[[0,1,2],:]
ufo.loc[0:2,:]


# In[12]:


# All rows, columns picked by an explicit list of labels.
ufo.loc[:,['City','State']]


# In[13]:


# All rows, every column from 'City' through 'State' (label slice, both ends included).
ufo.loc[:,'City':'State']


# In[22]:


# Row and column label slices combined.
ufo.loc[0:2,'City':'State']


# In[15]:


# Same idea from the other direction: take the first three rows, then drop the
# 'Time' column. drop() returns a new frame here; ufo itself is not modified.
ufo.head(3).drop('Time',axis=1)


# In[16]:


# Boolean filtering: keep the rows where City equals 'Oakland'.
ufo[ufo.City == 'Oakland']


# In[17]:


# .loc also accepts a boolean mask for the rows, together with a column label.
ufo.loc[ufo.City=='Oakland','State']


# In[18]:


#------------------ iloc --------------------#
# .iloc selects by integer position; the stop of a slice is excluded,
# so 0:4 is the first four columns.
ufo.iloc[:,0:4]


# In[26]:


# First three rows by position (positions 0, 1, 2).
ufo.iloc[0:3,:]


# In[31]:


# Method 1: the outer brackets index into the ufo DataFrame,
# while the inner brackets are the list of column names to select.
#ufo[['City','State']]

# Method 2: the same selection written with .loc.
ufo.loc[:,['City','State']]


# In[33]:


# Method 1: a bare slice on the DataFrame selects rows.
#ufo[0:2]

# Method 2: the same rows via .iloc, which makes the positional logic explicit.
ufo.iloc[0:2,:]


# In[35]:


#---------------------Another dataset-----------------#
# Use the 'country' column as the index so rows can be selected by country name.
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')


# In[36]:


drinks.head()


# In[37]:


# The notes originally used `.ix`, which mixed label and position lookups.
# `.ix` no longer exists in pandas, so the same selections are written with
# `.loc`, turning column positions into labels through `columns[...]`.
# Row by label, column by position (first column):
drinks.loc['Albania', drinks.columns[0]]


# In[38]:


# Row label slice (both endpoints included) with the first two columns by position.
drinks.loc['Albania':'Andorra', drinks.columns[0:2]]


# In[39]:


# ufo keeps the default integer index, so 0:2 is a label slice on the rows
# (rows 0, 1 and 2), combined with the first two columns by position.
ufo.loc[0:2, ufo.columns[0:2]]


# In[ ]:


# Notice: .ix was deprecated in pandas 0.20 and removed in pandas 1.0;
# use .loc (labels) or .iloc (positions) instead.

二十. When should I use the "inplace" parameter in Pandas?


# coding: utf-8

# In[1]:


#20  The `inplace` parameter
import pandas as pd


# In[18]:


ufo = pd.read_csv('http://bit.ly/uforeports')


# In[19]:


ufo.shape


# In[20]:


ufo.head()


# In[21]:


# drop() defaults to inplace=False: it returns a NEW DataFrame and leaves
# `ufo` untouched, so the change is lost unless the result is assigned.
# With inplace=True the method modifies `ufo` itself and returns None.
ufo.drop('City', axis=1, inplace=True)


# In[22]:


ufo.head()


# In[23]:


# dropna() is called without inplace here, so this only shows the shape of
# the filtered copy...
ufo.dropna(how='any').shape


# In[24]:


# ...and ufo itself still has all of its rows.
ufo.shape


# In[25]:


# The two methods below are equivalent
# Method 1:
#ufo.set_index('Time', inplace=True)
# Method 2:
ufo = ufo.set_index('Time')


# In[26]:


ufo.tail()


# In[27]:


# Filling missing values. fillna(method=...) is deprecated in recent pandas;
# ffill() / bfill() are the direct replacements.
# Forward fill: propagate the last valid value downwards (ufo is not modified).
ufo.ffill().tail()


# In[28]:


# Backward fill: take the next valid value below instead.
ufo.bfill().tail()

二十一. How do I make my Pandas DataFrame smaller and faster?


# coding: utf-8

#21  Shrinking a DataFrame with the category dtype
import pandas as pd

# Load the drinks-by-country data and take a first look.
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

# Summary of the frame, first with the default memory figure and then with
# memory_usage='deep'.
drinks.info()
drinks.info(memory_usage='deep')

# Per-column memory usage: default, deep, and the deep total.
drinks.memory_usage()
drinks.memory_usage(deep=True)
drinks.memory_usage(deep=True).sum()

# The distinct continent values, sorted.
sorted(drinks['continent'].unique())
drinks['continent'].head()

# Convert the continent column to the category dtype.
drinks['continent'] = drinks['continent'].astype('category')
drinks.dtypes

# The column after conversion, and the integer code behind each value.
drinks['continent'].head()
drinks['continent'].cat.codes.head()

# Deep per-column memory usage again, for comparison with the figures above.
drinks.memory_usage(deep=True)


# A small hand-made frame to show ordered categories.
df = pd.DataFrame(
    {
        'ID': [100, 101, 102, 103],
        'Quality': ['good', 'very good', 'good', 'excellent'],
    }
)
df

# While Quality holds plain strings, sorting is alphabetical.
df.sort_values(by='Quality')

# Declare the logical order good < very good < excellent.
df['Quality'] = df['Quality'].astype(
    pd.api.types.CategoricalDtype(
        categories=['good', 'very good', 'excellent'],
        ordered=True,
    )
)
df['Quality']

# Sorting now follows the declared category order.
df.sort_values(by='Quality')

# An ordered categorical supports comparisons: rows rated better than 'good'.
df[df['Quality'] > 'good']

二十二. How do I use scikit-learn to create Kaggle submissions?


# coding: utf-8

#22  From a pandas DataFrame to a scikit-learn model to a submission table
import pandas as pd

# Training data.
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()

# Feature matrix: the two columns used as predictors.
feature_cols = ['Pclass', 'Parch']
X = train[feature_cols]
X.shape

# Target vector: the Survived column.
y = train['Survived']
y.shape

# Fit a logistic regression on the training data.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)

# Test data: build its feature matrix from the same columns.
test = pd.read_csv('http://bit.ly/kaggletest')
test.head()
X_new = test[feature_cols]
X_new.shape

# Predicted class for every test row.
new_pred_class = logreg.predict(X_new)
new_pred_class

# Pair each PassengerId with its prediction.
pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': new_pred_class})

二十三. More of your Pandas questions answered


# coding: utf-8

# In[17]:


#23  Assorted questions: isnull, loc vs iloc slicing, random sampling
import pandas as pd


# In[18]:


# No index_col is given, so the rows get the default integer index.
ufo = pd.read_csv('http://bit.ly/uforeports')


# In[19]:


# The following two forms are equivalent:
# the top-level function pd.isnull(...) and the DataFrame method .isnull().
pd.isnull(ufo).head()


# In[20]:


ufo.isnull().head()


# In[21]:


# .loc slices by label and includes both endpoints: labels 0 through 4, five rows.
ufo.loc[0:4,:]


# In[22]:


# .iloc follows NumPy-style slicing (the stop position is excluded),
# so the line below returns only four rows.
ufo.iloc[0:4,:]


# In[23]:


# Label slice on the columns: everything from 'City' through 'State', inclusive.
ufo.loc[:,'City':'State']


# In[24]:


# Three randomly chosen rows; fixing random_state makes the draw reproducible.
ufo.sample(n=3,random_state=23)


# In[25]:


# Randomly draw 75% of the rows for training; random_state makes the draw
# reproducible.
train = ufo.sample(frac=0.75, random_state=99)


# In[28]:


train


# In[27]:


# The test set is every row that was NOT drawn into train, hence the negated
# mask. Without the `~` this would simply select the training rows again.
test = ufo.loc[~ufo.index.isin(train.index), :]


# In[29]:


test

 

 

 

你可能感兴趣的:(数据挖掘)