整理转载自 https://www.kaggle.com/learn/overview
pandas
- import
import pandas as pd
- create dataframe
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'],
'Sue': ['Pretty good.', 'Bland.']},
index=['Product A', 'Product B'])
- create Series
pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')
- save data
# `animals` must already exist as a DataFrame before it can be saved, for example:
# animals = pd.DataFrame({'Cows': [12, 20], 'Goats': [22, 19]}, index=['Year 1', 'Year 2'])
animals.to_csv('cows_and_goats.csv')
- read data
# save filepath to variable for easier access
melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'
# optional parameter: index_col selects which column to use as the row labels, e.g. pd.read_csv(path, index_col=0)
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path)
# print summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of melbourne_data
melbourne_data.describe()
- show columns
melbourne_data.columns
- choose feature
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
- quickly review the data
X.describe()
X.head()
X.tail()
- print data
X
scikit-learn
- DecisionTreeRegressor
- define
from sklearn.tree import DecisionTreeRegressor
# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)
- fit model
# Fit model. y is the prediction target and must be defined first, e.g. y = melbourne_data.Price
melbourne_model.fit(X, y)
- predict
melbourne_model.predict(X.head())
- Mean Absolute Error (MAE)
error = actual − predicted; MAE is the mean of the absolute values of these errors
from sklearn.metrics import mean_absolute_error
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)
- RandomForest
- define
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor(random_state=1)
seaborn
- import
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
- plot
# Set the width and height of the figure
plt.figure(figsize=(16,6))
# Add title
plt.title("FIFA rankings")
# Line chart showing how FIFA rankings evolved over time
sns.lineplot(data=fifa_data)
- plot subset
# Set the width and height of the figure
plt.figure(figsize=(14,6))
# Add title
plt.title("Daily Global Streams of Popular Songs in 2017-2018")
# Line chart showing daily global streams of 'Shape of You'
sns.lineplot(data=spotify_data['Shape of You'], label="Shape of You")
# Line chart showing daily global streams of 'Despacito'
sns.lineplot(data=spotify_data['Despacito'], label="Despacito")
# Add label for horizontal axis
plt.xlabel("Date")
- bar chart
# Set the width and height of the figure
plt.figure(figsize=(10,6))
# Add title
plt.title("Average Arrival Delay for Spirit Airlines Flights, by Month")
# Bar chart showing average arrival delay for Spirit Airlines flights by month
sns.barplot(x=flight_data.index, y=flight_data['NK'])
# Add label for vertical axis
plt.ylabel("Arrival delay (in minutes)")
- heatmap
# Set the width and height of the figure
plt.figure(figsize=(14,7))
# Add title
plt.title("Average Arrival Delay for Each Airline, by Month")
# Heatmap showing average arrival delay for each airline by month
# annot=True: ensures that the value of each cell is printed on the chart. (Leaving this out removes the numbers from the cells!)
sns.heatmap(data=flight_data, annot=True)
# Add label for horizontal axis
plt.xlabel("Airline")
- scatter plots
sns.scatterplot(x=insurance_data['bmi'],y=insurance_data['charges'])
- regplot
sns.regplot(x=insurance_data['bmi'], y=insurance_data['charges'])
- Color-coded scatter plots
sns.scatterplot(x=insurance_data['bmi'],y=insurance_data['charges'], hue=insurance_data['smoker'])
- lmplot
sns.lmplot(x="bmi",y="charges",hue="smoker",data=insurance_data)
- categorical scatter plot (swarmplot)
sns.swarmplot(x=insurance_data['smoker'],y=insurance_data['charges'])
- distplot
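# kde=False draws a plain histogram of counts; leaving it out also overlays a kernel density estimate (KDE) curve and rescales the bars to a density.
# NOTE: distplot is deprecated since seaborn 0.11; sns.histplot is its replacement for histograms.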
sns.distplot(a=iris_data['Petal Length (cm)'], kde=False)
- Density plots
# shade=True colors the area below the curve (seaborn 0.11+ renames this parameter to fill=True)
sns.kdeplot(data=iris_data['Petal Length (cm)'], shade=True)
- 2D KDE plots
sns.jointplot(x=iris_data['Petal Length (cm)'], y=iris_data['Sepal Width (cm)'], kind="kde")