十九. How do I select multiple rows and columns in a DataFrame?
# coding: utf-8
# In[1]:
#19
import pandas as pd
# In[2]:
# Download the UFO reports dataset used for every selection example below.
ufo = pd.read_csv("http://bit.ly/uforeports")
# In[3]:
ufo.head(n=3)
# In[4]:
# Row labelled 0, every column.
ufo.loc[0, :]
# In[7]:
# A label slice and an explicit label list select the same rows here:
# ufo.loc[[0, 1, 2], :] is equivalent to the slice form below.
ufo.loc[0:2, :]
# In[12]:
# Every row, columns picked by an explicit list of names.
ufo.loc[:, ["City", "State"]]
# In[13]:
# Every row, columns picked by a label range.
ufo.loc[:, "City":"State"]
# In[22]:
# Row label range combined with a column label range.
ufo.loc[0:2, "City":"State"]
# In[15]:
# First three rows with the 'Time' column left out.
ufo.head(n=3).drop(columns="Time")
# In[16]:
# Boolean filtering: keep the rows whose City equals 'Oakland'.
ufo[ufo["City"] == "Oakland"]
# In[17]:
# Same filter, but return only the 'State' column.
ufo.loc[ufo["City"] == "Oakland", "State"]
# In[18]:
# ------------------ iloc (position-based selection) ------------------
ufo.iloc[:, :4]
# In[26]:
ufo.iloc[:3, :]
# In[31]:
# Two ways to pick columns by name:
#   1. ufo[['City', 'State']] - the outer brackets index into ufo, the inner
#      brackets are the list of column names.
#   2. the .loc form below.
ufo.loc[:, ["City", "State"]]
# In[33]:
# Two ways to pick rows by position:
#   1. ufo[0:2] - a bare slice refers to rows.
#   2. the .iloc form below, which states the intent more explicitly.
ufo.iloc[:2, :]
# In[35]:
# --------------------- Another dataset ---------------------
# Load the drinks data, using the 'country' column as the row index.
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
# In[36]:
drinks.head()
# In[37]:
# `.ix` mixed label- and position-based lookup and was removed in pandas 1.0.
# To combine a row label with a column position, turn the position into a
# column label through `columns[...]` and select with `.loc`.
drinks.loc['Albania', drinks.columns[0]]
# In[38]:
# Row label range (both ends included) with the first two columns by position.
drinks.loc['Albania':'Andorra', drinks.columns[0:2]]
# In[39]:
# Row labels 0 through 2 with the first two columns by position.
ufo.loc[0:2, ufo.columns[0:2]]
# In[ ]:
# Notice: `.ix` is no longer available in current pandas; use `.loc` for
# labels and `.iloc` for positions.
二十. When should I use the "inplace" parameter in Pandas
# coding: utf-8
# In[1]:
#20
import pandas as pd
# In[18]:
ufo = pd.read_csv('http://bit.ly/uforeports')
# In[19]:
ufo.shape
# In[20]:
ufo.head()
# In[21]:
# `drop` defaults to inplace=False: it returns a new DataFrame and leaves
# `ufo` itself unchanged. With inplace=True the column is removed from `ufo`
# directly and nothing needs to be reassigned.
ufo.drop('City', axis=1, inplace=True)
# In[22]:
ufo.head()
# In[23]:
# No inplace=True here, so only the returned object has rows removed ...
ufo.dropna(how='any').shape
# In[24]:
# ... and `ufo` keeps its previous shape.
ufo.shape
# In[25]:
# Two equivalent ways to make 'Time' the index:
#   Method 1: ufo.set_index('Time', inplace=True)
#   Method 2: reassign the returned DataFrame, as done below.
ufo = ufo.set_index('Time')
# In[26]:
ufo.tail()
# In[27]:
# Filling missing values. `fillna(method=...)` is deprecated in recent pandas;
# the dedicated `ffill` / `bfill` methods perform the same fill.
# Forward fill: carry the last valid value forward.
ufo.ffill().tail()
# In[28]:
# Backward fill: use the next valid value.
ufo.bfill().tail()
二十一. How do I make my Pandas DataFrame smaller and faster?
# coding: utf-8
# In[2]:
#21
import pandas as pd
# In[3]:
# Drinks-by-country dataset, default integer index.
drinks = pd.read_csv("http://bit.ly/drinksbycountry")
# In[4]:
drinks.head()
# In[5]:
# Summary of the frame, including pandas' default memory figure.
drinks.info()
# In[6]:
# Same summary, asking for the 'deep' memory measurement.
drinks.info(memory_usage="deep")
# In[7]:
# Per-column memory usage.
drinks.memory_usage()
# In[8]:
# Per-column memory usage with deep=True.
drinks.memory_usage(deep=True)
# In[9]:
# Total across all columns.
drinks.memory_usage(deep=True).sum()
# In[10]:
# The distinct continent values, sorted.
sorted(drinks["continent"].unique())
# In[11]:
drinks["continent"].head()
# In[12]:
# Convert the continent column to the 'category' dtype.
drinks["continent"] = drinks["continent"].astype("category")
# In[13]:
drinks.dtypes
# In[14]:
drinks["continent"].head()
# In[16]:
# The integer codes behind the categorical values.
drinks["continent"].cat.codes.head()
# In[17]:
# Re-measure memory after the conversion to compare with the earlier figures.
drinks.memory_usage(deep=True)
# In[19]:
# Small hand-built frame for demonstrating ordered categories.
df = pd.DataFrame(
    {
        "ID": [100, 101, 102, 103],
        "Quality": ["good", "very good", "good", "excellent"],
    }
)
# In[20]:
df
# In[22]:
# Quality is still plain strings here, so this sorts them as text.
df.sort_values(by="Quality")
# In[39]:
# Give Quality an explicit ranking: good < very good < excellent.
df["Quality"] = df["Quality"].astype(
    pd.api.types.CategoricalDtype(
        categories=["good", "very good", "excellent"],
        ordered=True,
    )
)
# In[40]:
df["Quality"]
# In[41]:
# Sorting now follows the declared category order.
df.sort_values(by="Quality")
# In[42]:
# Ordered categories support comparisons: keep rows ranked above 'good'.
df.loc[df["Quality"] > "good", :]
二十二. How do I use scikit-learn to create a Kaggle submission?
# coding: utf-8
# In[1]:
#22
import pandas as pd
# In[2]:
# Training data for the Kaggle exercise.
train = pd.read_csv("http://bit.ly/kaggletrain")
# In[3]:
train.head()
# In[4]:
# Columns used as model inputs.
feature_cols = ["Pclass", "Parch"]
# In[5]:
# Feature matrix: every training row, only the feature columns.
X = train.loc[:, feature_cols]
# In[6]:
X.shape
# In[7]:
# Target vector: the Survived column.
y = train["Survived"]
# In[8]:
y.shape
# In[10]:
# Fit a logistic regression classifier with default settings.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)
# In[11]:
# Test data, for which predictions are required.
test = pd.read_csv("http://bit.ly/kaggletest")
# In[12]:
test.head()
# In[13]:
# Same feature columns, taken from the test set.
X_new = test.loc[:, feature_cols]
# In[14]:
X_new.shape
# In[15]:
# Predicted class for every test row.
new_pred_class = logreg.predict(X_new)
# In[17]:
new_pred_class
# In[19]:
# Pair each PassengerId with its prediction. The frame is only built and
# displayed here; nothing is written to disk.
pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": new_pred_class,
    }
)
二十三. More of your Pandas questions answered
# coding: utf-8
# In[17]:
#23
import pandas as pd
# In[18]:
ufo = pd.read_csv("http://bit.ly/uforeports")
# In[19]:
# The top-level function and the DataFrame method are equivalent.
pd.isnull(ufo).head()
# In[20]:
ufo.isnull().head()
# In[21]:
# Label-based slice: labels 0 through 4.
ufo.loc[:4, :]
# In[22]:
# iloc follows NumPy-style slicing, so this returns only four rows.
ufo.iloc[:4, :]
# In[23]:
# Every row, columns from 'City' through 'State'.
ufo.loc[:, "City":"State"]
# In[24]:
# Three random rows; the fixed random_state makes the draw repeatable.
ufo.sample(n=3, random_state=23)
# In[25]:
# Randomly sample 75% of the rows as the training set (repeatable via
# random_state).
train = ufo.sample(frac=0.75, random_state=99)
# In[28]:
train
# In[27]:
# The test set is every row NOT selected for training. Without the `~` the
# mask keeps the rows that are in `train`, so `test` would just repeat the
# training rows instead of holding out the remaining 25%.
test = ufo.loc[~ufo.index.isin(train.index), :]
# In[29]:
test