左手python右手R

写在前面

最近在学习python,结合一个实际案例,写一下python和R在做数据分析上的差异。
本人对python还不是特别熟练,所以python的代码来自于Kaggle上的一个高票(high-vote)回帖。
我这里只是转写一下R的版本,转写python代码之后感觉python做数据分析和可视化实在不如R给力。代码丢这了,有机会说说如何用tidyverse分析数据吧。
这里写了多数代码,剩下流程差不多的就放弃写了。还有机器学习的部分回头有心情了用tidymodels写一下基本的框架吧。


Netflix is an application that keeps growing bigger and faster with its popularity, shows and content. This is an EDA
or a story telling through its data along with a content-based recommendation system and a wide range of different
graphs and visuals.

image.png

The python source code is from here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
library(tidyverse)
library(skimr)
# Load the TidyTuesday release dated 2021-04-20 and keep its
# `netflix_titles` table.
data <- tidytuesdayR::tt_load('2021-04-20')
netfix_dta <- data$netflix_titles
# Optional: install a Python module that is missing from the Python
# environment used by reticulate, for example:
# reticulate::py_install('seaborn',pip = TRUE)

Pass the data from R to Python in RStudio


# `r` exposes the objects of the R session; fetch the table loaded there.
netflix_overall = r.netfix_dta
netflix_overall.head(n=5)

Also, you can do the same thing using R

# Preview the first few rows of the table.
head(netfix_dta)

Or

# Column-wise overview: one line per column with its type and first values.
glimpse(netfix_dta)

Therefore, it is clear that the dataset contains 12 columns for exploratory analysis.

# Number of non-missing values in each column.
netflix_overall.count()

Also, in R you can do it better.

# Summary statistics for every column, including missing-value counts.
skim(netfix_dta)

# Keep only the rows whose `type` is 'TV Show'.
netflix_shows = netflix_overall.loc[netflix_overall['type'].eq('TV Show')]
netflix_shows.head()

In R, you can do the same with the pipe operator (`%>%`), which chains the steps and makes your script easier to read.

# TV shows only.
netflix_shows <- netfix_dta %>%
  filter(type %in% "TV Show")

netflix_shows %>%
  head()

# Keep only the rows whose `type` is 'Movie'.
netflix_movies = netflix_overall.loc[netflix_overall['type'].eq('Movie')]
# Movies only.
netflix_movies <- netfix_dta %>%
  filter(type %in% "Movie")

Analysis of Movies vs TV Shows.


# Bar chart with the number of titles per content type.
sns.set(style="darkgrid")
ax = sns.countplot(data=netflix_overall, x="type", palette="Set2")
plt.show()

In R

# Number of titles per type; fct_rev() reverses the order of the types on
# the x axis.
ggplot(netfix_dta, aes(x = fct_rev(type), fill = type)) +
  geom_bar() +
  theme_bw()

It is evident that there are more Movies on Netflix than TV shows.

If a producer wants to release some content, in which month should they do so? (The month in which the least content is added.)

# Heatmap of the number of TV shows added per month and year.
netflix_date = netflix_shows[['date_added']].dropna()
added = netflix_date['date_added']
# The year is the part after the last ', '; the month is the first word.
netflix_date['year'] = added.str.split(', ').str[-1]
netflix_date['month'] = added.str.lstrip().str.split(' ').str[0]

# Calendar months in reverse order (December ... January).
month_order = list(reversed([
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]))
month_counts = netflix_date.groupby('year')['month'].value_counts()
df = month_counts.unstack().fillna(0)[month_order].T

plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(df, cmap='afmhot_r', edgecolors='white', linewidths=2)  # heatmap
# Put each tick in the middle of its cell.
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(df.index), 1), df.index, fontsize=7, fontfamily='serif')

plt.title(
    'Netflix Contents Update',
    fontsize=12,
    fontfamily='calibri',
    fontweight='bold',
    position=(0.20, 1.0+0.02),
)
cbar = plt.colorbar()

cbar.ax.tick_params(labelsize=8)
cbar.ax.minorticks_on()
plt.show()
library(lubridate)
library(viridis)

# Heatmap of the number of titles added per month and year.
netfix_dta %>%
  transmute(
    added_on = mdy(date_added),
    year = year(added_on),
    month = month(added_on, label = TRUE, abbr = FALSE)
  ) %>%
  # Rows whose date is missing or could not be parsed have no month.
  filter(!is.na(month)) %>%
  group_by(year, month) %>%
  summarise(contents = n()) %>%
  ggplot(aes(x = year, y = fct_rev(month), fill = contents)) +
  geom_tile() +
  scale_fill_viridis(option = "A") +
  labs(title = 'Netflix Contents Update', x = '', y = '')

Movie ratings analysis

# Movie counts per rating, limited to at most the 15 most frequent ratings.
top_ratings = netflix_movies['rating'].value_counts().index[:15]
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.countplot(data=netflix_movies, x="rating", order=top_ratings, palette="Set2")
plt.show()

In R

# Rating distribution for movies only, so that the chart matches the section
# title and the Python plot, which is also built from `netflix_movies`.
netflix_movies %>%
  filter(!is.na(rating)) %>%
  count(rating) %>%
  # Bars are ordered from the most to the least frequent rating.
  ggplot(aes(x = fct_reorder(rating, n, .desc = TRUE), y = n, fill = rating)) +
  geom_col(show.legend = FALSE) +
  # No gap below the bars, 10% headroom above the tallest one.
  scale_y_continuous(expand = expansion(c(0, .1))) +
  labs(
    x = 'Rating',
    y = 'Count'
  )

Analysing IMDB ratings to get top rated movies on Netflix

# Read only the columns that are needed from the two IMDb files.
imdb_ratings = pd.read_csv('netflix/IMDb ratings.csv', usecols=['weighted_average_vote'])
imdb_titles = pd.read_csv('netflix/IMDb movies.csv', usecols=['title', 'year', 'genre'])

# The columns are combined by row index, i.e. both files are taken to list
# the titles in the same order.
ratings = pd.DataFrame({
    'Title': imdb_titles['title'],
    'Release Year': imdb_titles['year'],
    'Rating': imdb_ratings['weighted_average_vote'],
    'Genre': imdb_titles['genre'],
})
ratings = ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'])
ratings.shape

ratings.head()

In R

# Columns are selected by name instead of by position, so the code keeps
# working if the column order of the CSV files changes.
imdb_ratings <- read_csv("netflix/IMDb ratings.csv") %>%
  select(imdb_title_id, weighted_average_vote)
imdb_titles <- read_csv("netflix/IMDb movies.csv") %>%
  select(imdb_title_id, title, year, genre)

# Attach the rating to every title via the shared IMDb id, then drop rows
# that repeat the same title / year / rating, as the Python version does
# with drop_duplicates().
ratings <- imdb_titles %>%
  left_join(imdb_ratings, by = "imdb_title_id") %>%
  select(title, year, genre, Rating = weighted_average_vote) %>%
  distinct(title, year, Rating, .keep_all = TRUE)
ratings
dim(ratings)
# dropna() returns a new frame, so the result has to be assigned back;
# otherwise the rows with missing values stay in `ratings`.
ratings = ratings.dropna()
# Keep only the IMDb titles that also appear in the Netflix data, best rated
# first.
joint_data = ratings.merge(netflix_overall, left_on='Title', right_on='title', how='inner')
joint_data = joint_data.sort_values(by='Rating', ascending=False)

joint_data.head()
joint_data.shape
# Drop incomplete rows, keep only the titles that also exist in the Netflix
# data, and put the best rated titles first.
# drop_na() is used because `filter(!is.na(.))` hands filter() a logical
# matrix (one column per variable) instead of one logical value per row.
joint_data <- ratings %>%
  drop_na() %>%
  inner_join(netfix_dta, by = "title") %>%
  arrange(desc(Rating))

dim(joint_data)
import plotly.express as px

# The first ten rows of `joint_data`, which was sorted by Rating (descending).
top_rated = joint_data.head(10)
top_rated
# Inner ring: title, outer ring: country; size and colour follow the rating.
fig = px.sunburst(top_rated, path=['title', 'country'], values='Rating', color='Rating')
fig.show()
library(plotly)

# Ten best rated titles; head() also copes with fewer than ten rows.
top_rated <- head(joint_data, 10)
n_top <- nrow(top_rated)
# Every title appears twice: once as an inner sector and once as the outer
# "country" sector, so the ratings are repeated in the same order.
sector_rating <- c(top_rated$Rating, top_rated$Rating)

fig <- plot_ly(
  # Outer sectors get the title as prefix so that their ids stay unique when
  # several titles share a country.
  ids = c(top_rated$title, paste0(top_rated$title, "-", top_rated$country)),
  labels = c(top_rated$title, top_rated$country),
  # Titles hang off the root (""); each country sector hangs off its title.
  parents = c(rep("", n_top), top_rated$title),
  # branchvalues = "total" needs an explicit value for every sector.
  values = sector_rating,
  # Numeric per-sector colours belong in marker$colors; the `colors`
  # argument of plot_ly() expects a palette, not data values.
  marker = list(colors = sector_rating),
  type = "sunburst",
  branchvalues = "total"
)

fig
# Same chart again, this time fed with the `top_rated` table from the R session.
sunburst_options = dict(path=['title', 'country'], values='Rating', color='Rating')
fig = px.sunburst(r.top_rated, **sunburst_options)
fig.show()

Countries with highest rated content.

# Number of joined titles per country, most frequent first; keep the first
# 11 rows.
country_count = (
    joint_data['country']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)
topcountries = country_count.head(11)
topcountries
# Titles per country, most frequent first, ignoring rows without a country.
topcountries <- joint_data %>%
  count(country, sort = TRUE) %>%
  filter(!is.na(country))
import plotly.express as px

# Hard-coded title counts per country for the funnel chart.
data = {
    'number': [1063, 619, 135, 60, 44, 41, 40, 40, 38, 35],
    'country': [
        "United States", "India", "United Kingdom", "Canada", "Spain",
        'Turkey', 'Philippines', 'France', 'South Korea', 'Australia',
    ],
}
fig = px.funnel(data, x='number', y='country')
fig.show()
library(reticulate)

# Fetch the Python dict `data` through reticulate and order it by count.
# NOTE: this overwrites the R object `data` that tt_load() created earlier.
data <- py$data %>%
  as.data.frame() %>%
  arrange(desc(number))

# The trailing comma after `type` was removed: it passed an empty argument
# to plot_ly().
plot_ly(
  y = data$country,
  x = data$number,
  type = "funnel"
) %>%
  # Keep the stages in the order of the sorted data frame.
  layout(yaxis = list(categoryorder = "array", categoryarray = data$country))

Year wise analysis

# Movie counts for at most the 15 most frequent release years, as horizontal
# bars.
top_years = netflix_movies['release_year'].value_counts().index[:15]
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.countplot(data=netflix_movies, y="release_year", order=top_years, palette="Set2")
plt.show()
# The 15 release years with the most movies, drawn as horizontal bars.
netflix_movies %>%
  count(release_year, sort = TRUE) %>%
  slice_head(n = 15) %>%
  # Freeze the sorted order as factor levels so ggplot keeps it.
  mutate(release_year = factor(release_year, levels = release_year)) %>%
  ggplot(aes(x = n, y = fct_rev(release_year), fill = release_year)) +
  geom_col(show.legend = FALSE) +
  ggsci::scale_fill_simpsons()

Analysis of duration of movies

# Work on a copy: `netflix_movies` is a filtered slice of `netflix_overall`,
# and assigning a column to such a slice triggers SettingWithCopyWarning.
netflix_movies = netflix_movies.copy()
# Strip the ' min' suffix and convert to numbers. Values that cannot be
# parsed (for example missing durations) become NaN instead of raising, and
# the cell can be run again without failing on already converted values.
minutes = netflix_movies['duration'].astype(str).str.replace(' min', '', regex=False)
netflix_movies['duration'] = pd.to_numeric(minutes, errors='coerce')
netflix_movies['duration']
plt.figure(figsize=(8, 8))
sns.set(style="darkgrid")
# `fill` replaces the deprecated `shade` argument of kdeplot.
sns.kdeplot(data=netflix_movies['duration'].dropna(), fill=True)
plt.show()
# Density of movie durations after removing the " min" suffix.
netflix_movies %>%
  mutate(duration = as.double(str_remove(duration, " min"))) %>%
  ggplot(aes(x = duration)) +
  geom_density(fill = "blue2", alpha = .4) +
  ggthemes::theme_solarized()
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

from collections import Counter

# Each movie lists its genres as one comma-separated string: split them and
# remove every blank inside a genre name.
genres = list(netflix_movies['listed_in'])
gen = [label.replace(' ', "") for entry in genres for label in entry.split(',')]
# Number of movies per genre.
g = Counter(gen)

# The cloud is generated from the text form of the list of distinct genres.
text = list(set(gen))
plt.rcParams['figure.figsize'] = (13, 13)

cloud_maker = WordCloud(max_words=1000000, background_color="white")
wordcloud = cloud_maker.generate(str(text))

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
library(wordcloud)
library(tidytext)
# Fix the random seed before drawing the cloud.
set.seed(2021)
# One row per word of the `listed_in` column, with its frequency.
word_freq <- netflix_movies %>%
  unnest_tokens(word, listed_in) %>%
  count(word, sort = TRUE)
wordcloud(word_freq$word, word_freq$n, max.words = 100)


# `matplotlib` itself has to be imported before calling use(); importing
# matplotlib.pyplot as plt does not bind the name `matplotlib`.
import matplotlib
matplotlib.use('TkAgg')
# Order the genre counts from most to least frequent.
g = dict(sorted(g.items(), key=lambda item: item[1], reverse=True))
g
fig, ax = plt.subplots()

x = list(g.keys())
y = list(g.values())
# Lollipop chart: one stem per genre with a marker at its top.
ax.vlines(x, ymin=0, ymax=y, color='green')
ax.plot(x, y, "o", color='maroon')
# Rotate the existing tick labels; calling set_xticklabels() without fixed
# tick positions makes matplotlib warn and can mislabel ticks.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Count of movies")
# set a title
ax.set_title("Genres")
plt.show()
# Bring the Python dict `g` into R as a one-column data frame (`n`) whose
# row names are the dict keys.
g <- data.frame(n = unlist(py$g))

g %>%
  rownames_to_column("name") %>%
  # Order the names from the highest to the lowest count.
  mutate(name = fct_reorder(name, n, .desc = TRUE)) %>%
  ggplot(aes(x = name, y = n)) +
  geom_segment(aes(xend = name, y = 0, yend = n)) +
  geom_point(size = 5, color = 'orange') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Lowest number of seasons.

import plotly.graph_objects as go

# Copy the two columns so that adding `no_of_seasons` does not write into a
# slice of `netflix_shows` (SettingWithCopyWarning).
features = ['title', 'duration']
durations = netflix_shows[features].copy()

# '1 Season' / '3 Seasons' -> 1 / 3: take the leading number directly
# instead of stripping ' Season' and 's' in two steps.
durations['no_of_seasons'] = (
    durations['duration'].str.extract(r'(\d+)', expand=False).astype(int)
)

t = ['title', 'no_of_seasons']
top = durations[t]

top = top.sort_values(by='no_of_seasons', ascending=False)
# Ascending order, then rows 20 to 49 of that ordering.
bottom = top.sort_values(by='no_of_seasons')
bottom = bottom[20:50]

# plotly renders the table itself, so no matplotlib figure is opened here.
fig = go.Figure(data=[go.Table(
    header=dict(values=['Title', 'No of seasons']),
    cells=dict(values=[bottom['title'], bottom['no_of_seasons']], fill_color='lavender'),
)])
fig.show()
library(kableExtra)
# Split `duration` at the blank into the number and the unit, then list the
# shows with the most seasons first in a styled table.
table_style <- c("striped", "hover", "condensed", "responsive")
netflix_shows %>%
  select(title, duration) %>%
  separate(duration, into = c("duration", "season"), sep = " ") %>%
  mutate(duration = as.numeric(duration)) %>%
  arrange(desc(duration)) %>%
  kbl() %>%
  kable_styling(bootstrap_options = table_style)

你可能感兴趣的:(左手python右手R)