整理转载自 https://www.kaggle.com/learn/overview

pandas

import

import pandas as pd

create dataframe

# Build a table from a dict: each key becomes a column, each list holds that
# column's values, and `index` supplies the row labels.
pd.DataFrame(
    data={
        'Bob': ['I liked it.', 'It was awful.'],
        'Sue': ['Pretty good.', 'Bland.'],
    },
    index=['Product A', 'Product B'],
)

create Series

# A Series is a single labelled sequence of values: `index` labels each entry
# and `name` labels the Series as a whole.
pd.Series(
    data=[30, 35, 40],
    index=['2015 Sales', '2016 Sales', '2017 Sales'],
    name='Product A',
)

save data

# Build a small example table (rows are years, columns are animal counts) ...
animals = pd.DataFrame({'Cows': [12, 20], 'Goats': [22, 19]}, index=['Year 1', 'Year 2'])
# ... and write it, including the row labels, to a CSV file in the working directory.
animals.to_csv('cows_and_goats.csv')

read data

# Keep the CSV location in a variable so it can be reused below.
melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'
# Load the CSV into a DataFrame named melbourne_data.
# (read_csv also accepts an optional `index_col` argument to choose which
# column is used as the row index.)
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)
# Show summary statistics for the loaded data.
melbourne_data.describe()

show columns

# List the column labels of the loaded DataFrame
melbourne_data.columns

choose feature

# Prediction target. The model-fitting and MAE snippets below use `y`, which
# was never defined in these notes.
# NOTE(review): assumes the sale-price column is named 'Price' -- confirm
# against melbourne_data.columns.
y = melbourne_data['Price']

# Columns used as model inputs ("features").
# NOTE(review): 'Lattitude' / 'Longtitude' presumably match the CSV headers
# exactly -- check before "correcting" the spelling.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
# X is the sub-table holding only the feature columns.
X = melbourne_data[melbourne_features]

quickly review the data

# Summary statistics for each feature column
X.describe()
# First few rows of X
X.head()
# Last few rows of X
X.tail()

print data

scikit-learn

DecisionTreeRegressor

define

from sklearn.tree import DecisionTreeRegressor

# Create the (not yet fitted) decision-tree regressor. Passing a fixed number
# for random_state ensures the same results on every run.
melbourne_model = DecisionTreeRegressor(random_state=1)

fit model

# Train the model on the feature table X and the target values y
melbourne_model.fit(X, y)

predict

# Predict for only the first few rows of X, as a quick look at the model's output
melbourne_model.predict(X.head())

Mean Absolute Error（MAE）
error = actual − predicted; MAE is the mean of |error| over all rows

from sklearn.metrics import mean_absolute_error

# Predict for the same rows the model was fitted on, then average the absolute
# differences between the true values (y) and those predictions. Because the
# model is scored on its own training rows, this is an in-sample error.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

RandomForest

define

from sklearn.ensemble import RandomForestRegressor
# Create the (not yet fitted) random-forest regressor; random_state is fixed
# for the same reason as for the decision tree above: repeatable results.
forest_model = RandomForestRegressor(random_state=1)

seaborn

import

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

plot

# Create a figure of width 16 and height 6 with a single set of axes
fig, ax = plt.subplots(figsize=(16, 6))
# Title the chart
ax.set_title("FIFA rankings")
# Line chart of the whole fifa_data table (how the rankings evolved over time)
sns.lineplot(data=fifa_data, ax=ax)

plot subset

# Create a figure of width 14 and height 6 with a single set of axes
fig, ax = plt.subplots(figsize=(14, 6))

# Title the chart
ax.set_title("Daily Global Streams of Popular Songs in 2017-2018")

# Plot only the selected columns: one line per song, each labelled with the
# song's name in the legend
for song in ('Shape of You', 'Despacito'):
    sns.lineplot(data=spotify_data[song], label=song, ax=ax)

# Label the horizontal axis
ax.set_xlabel("Date")

bar chart

# Create a figure of width 10 and height 6 with a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))

# Title the chart
ax.set_title("Average Arrival Delay for Spirit Airlines Flights, by Month")

# One bar per row of flight_data (x = the table's index), with bar height
# taken from the 'NK' column (the Spirit Airlines column, per the title)
sns.barplot(x=flight_data.index, y=flight_data['NK'], ax=ax)

# Label the vertical axis
ax.set_ylabel("Arrival delay (in minutes)")

heatmap

# Create a figure of width 14 and height 7 with a single set of axes
fig, ax = plt.subplots(figsize=(14, 7))

# Title the chart
ax.set_title("Average Arrival Delay for Each Airline, by Month")

# Heatmap of the whole flight_data table (average delay per airline and month).
# annot=True writes each cell's value on the chart; without it the cells are
# coloured but carry no numbers.
sns.heatmap(data=flight_data, annot=True, ax=ax)

# Label the horizontal axis
ax.set_xlabel("Airline")

scatter plots

# Scatter plot of charges against BMI, one point per row of insurance_data
sns.scatterplot(data=insurance_data, x='bmi', y='charges')

regplot

# Same scatter plot, with a fitted regression line drawn through the points
sns.regplot(data=insurance_data, x='bmi', y='charges')

Color-coded scatter plots

# Scatter plot of charges against BMI, with points coloured by the 'smoker' column
sns.scatterplot(data=insurance_data, x='bmi', y='charges', hue='smoker')

lmplot

# Regression plot split by the 'smoker' column. Unlike the calls above, the
# columns are given by name and the table itself is passed through data=.
sns.lmplot(x="bmi",y="charges",hue="smoker",data=insurance_data)

categorical scatter plot (swarmplot)

# Categorical scatter plot: charges on the vertical axis, grouped by smoker status
sns.swarmplot(data=insurance_data, x='smoker', y='charges')

distplot

# Histogram of petal length.
# sns.distplot is deprecated (since seaborn 0.11); sns.histplot is its
# replacement for histograms and draws no KDE curve by default, so the old
# kde=False argument is no longer needed.
sns.histplot(x=iris_data['Petal Length (cm)'])

Density plots

# fill=True colors the area below the curve
# (`fill` replaces the `shade` argument, deprecated since seaborn 0.11)
sns.kdeplot(data=iris_data['Petal Length (cm)'], fill=True)

2D KDE plots

# Two-dimensional KDE of petal length against sepal width (kind="kde")
sns.jointplot(data=iris_data, x='Petal Length (cm)', y='Sepal Width (cm)', kind="kde")

Kaggle教程的一些整理

pandas

scikit-learn

seaborn

你可能感兴趣的:(Kaggle教程的一些整理)