Word Clouds from Douban, NetEase Cloud Music, Google, and Weibo Hot Lists

1. Douban

import requests
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams["font.sans-serif"] = ["Microsoft YaHei"]  # configure the font so Chinese characters render correctly


# Domestic TV dramas (tag=国产剧)
lists = []
lists_aa =[]
for i in range(5):
    url1 = "https://movie.douban.com/j/search_subjects?type=tv&tag=国产剧&sort=recommend&page_limit=20&page_start={}".format(i*20)
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    html1 = requests.get(url1, headers=headers).json()
    for j in range(len(html1['subjects'])):
        title = html1['subjects'][j]['title'].replace(' ','')
        rate = html1['subjects'][j]['rate']
        print(title+" "+rate)
        lists_aa.append(title+" "+rate)
        lists.append(title)
    

text=' '.join(lists)
wc = WordCloud(font_path="/Users/lonng/Library/Fonts/msyh.ttf",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
# word_cloud.to_file('3.png')
plt.figure(figsize=(26,26))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()



# Variety shows (tag=综艺)
lists1 = []
lists_bb=[]
for i in range(5):
    url1 = "https://movie.douban.com/j/search_subjects?type=tv&tag=综艺&sort=recommend&page_limit=20&page_start={}".format(i*20)
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    html1 = requests.get(url1, headers=headers).json()
    for j in range(len(html1['subjects'])):
        title = html1['subjects'][j]['title'].replace(' ','')
        rate = html1['subjects'][j]['rate']
        print(title+" "+rate)
        lists_bb.append(title+" "+rate)
        lists1.append(title)


text=' '.join(lists1)
wc = WordCloud(font_path="/Users/lonng/Library/Fonts/msyh.ttf",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
# word_cloud.to_file('3.png')
plt.figure(figsize=(26,26))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()


# American TV series (tag=美剧)
lists2 = []
for i in range(5):
    url1 = "https://movie.douban.com/j/search_subjects?type=tv&tag=美剧&sort=recommend&page_limit=20&page_start={}".format(i*20)
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    html1 = requests.get(url1, headers=headers).json()
    for j in range(len(html1['subjects'])):
        title = html1['subjects'][j]['title'].replace(' ','')
        rate = html1['subjects'][j]['rate']
        print(title+" "+rate)
        lists2.append(title)


text=' '.join(lists2)
wc = WordCloud(font_path="/Users/lonng/Library/Fonts/msyh.ttf",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
# word_cloud.to_file('3.png')
plt.figure(figsize=(26,26))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()


# Movies (tag=热门)
lists3 = []
lists_cc=[]
for i in range(5):
    url1 = "https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=recommend&page_limit=20&page_start={}".format(i*20)
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    html1 = requests.get(url1, headers=headers).json()
    for j in range(len(html1['subjects'])):
        title = html1['subjects'][j]['title'].replace(' ','')
        rate = html1['subjects'][j]['rate']
        print(title+" "+rate)
        lists_cc.append(title+" "+rate)
        lists3.append(title)


text=' '.join(lists3)
wc = WordCloud(font_path="/Users/lonng/Library/Fonts/msyh.ttf",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
#word_cloud.to_file('3.png')
plt.figure(figsize=(26,26))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
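
The four Douban blocks above differ only in the type and tag query parameters. A minimal refactoring sketch is shown below; the function name douban_wordcloud and its parameters are illustrative, and it assumes the same endpoint, headers, and font path used above.

import requests
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def douban_wordcloud(kind, tag, pages=5, font_path="/Users/lonng/Library/Fonts/msyh.ttf"):
    """Fetch pages * 20 hot titles for one Douban tag and draw a word cloud."""
    headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    titles = []
    for i in range(pages):
        url = ("https://movie.douban.com/j/search_subjects"
               "?type={}&tag={}&sort=recommend&page_limit=20&page_start={}").format(kind, tag, i * 20)
        data = requests.get(url, headers=headers).json()
        titles += [item['title'].replace(' ', '') for item in data.get('subjects', [])]
    wc = WordCloud(font_path=font_path, background_color='black',
                   max_words=100, max_font_size=120, scale=10)
    plt.figure(figsize=(26, 26))
    plt.imshow(wc.generate(' '.join(titles)))
    plt.axis("off")
    plt.show()
    return titles

# e.g. douban_wordcloud('tv', '国产剧') or douban_wordcloud('movie', '热门')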

2. NetEase Cloud Music

import requests
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams["font.sans-serif"] = ["Microsoft YaHei"]  # configure the font so Chinese characters render correctly


# US chart playlist (fetched here but not used below; kept in its own variable so the China request does not overwrite it)
url_us = "http://music.163.com/api/playlist/detail?id=60198"
headers = {
    "referer": "https://music.163.com/",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
html_us = requests.get(url_us, headers=headers).json()



# China hot chart
url1 = "http://music.163.com/api/playlist/detail?id=3778678"
headers = {
    "referer": "https://music.163.com/",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
html = requests.get(url1, headers=headers).json()


lists = []
lists1 = []
for i in range(len(html['result']['tracks'])):
    name = html['result']['tracks'][i]['name']
    artist = html['result']['tracks'][i]['artists'][0]['name']
    lists.append(name)
    lists.append(artist)
    lists1.append(str(i + 1) + " " + name + "-" + artist)
print(lists1)
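
If you also want the ranked chart on disk, a minimal sketch (the output filename is illustrative):

with open('netease_hot_chart.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(lists1))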


text=' '.join(lists)
wc = WordCloud(font_path="/Users/lonng/Library/Fonts/msyh.ttf",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
# word_cloud.to_file('3.png')
plt.figure(figsize=(26,26))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
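
The two playlist requests above are identical apart from the playlist id. A small helper like the sketch below keeps each chart in its own variable; the function name fetch_playlist is illustrative, and the response layout (result -> tracks -> name/artists) is assumed to stay as in the code above.

import requests

def fetch_playlist(playlist_id):
    """Return 'rank name-artist' strings for one NetEase playlist via the unofficial API used above."""
    url = "http://music.163.com/api/playlist/detail?id={}".format(playlist_id)
    headers = {
        "referer": "https://music.163.com/",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    }
    data = requests.get(url, headers=headers).json()
    tracks = data.get('result', {}).get('tracks', [])
    return ["{} {}-{}".format(i + 1, t['name'], t['artists'][0]['name'])
            for i, t in enumerate(tracks)]

# e.g. fetch_playlist(3778678)   # the China hot chart used above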




3. Google

import json
import re
import requests

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams["font.sans-serif"] = ["Microsoft YaHei"]  # configure the font so Chinese characters render correctly
path = r'/Users/lonng/Desktop/v+/呆萌的停用词表.txt'  # stop-word list for jieba


# US
url = "https://trends.google.com/trends/api/dailytrends?hl=en-US&tz=-480&ed=20200409&geo=US&ns=15"
html = requests.get(url).text
html1 = json.loads(re.findall('({"default.*)', html)[0])  # drop the ")]}'," prefix, then parse the JSON body

# Japan
url2 = "https://trends.google.com/trends/api/dailytrends?hl=en-US&tz=-480&ed=20200409&geo=JP&ns=15"
html2 = requests.get(url2).text
html22 = json.loads(re.findall('({"default.*)', html2)[0])


lists = []
# other geo codes to try: JP US GB FR DE HK TW

days = [21, 20, 19, 18, 17]  # 2020-04-17 .. 2020-04-21
for mm in days:
    url = "https://trends.google.com/trends/api/dailytrends?hl=en-US&tz=-480&ed=202004{}&geo=US&ns=15".format(str(mm))
    html = requests.get(url).text
    html1 = json.loads(re.findall('({"default.*)', html)[0])

    searches = html1['default']['trendingSearchesDays'][0]['trendingSearches']
    for item in searches:
        title = item['title']['query']
        print(title)
        lists.append(title)
        for related in item.get('relatedQueries', []):
            lists.append(related['query'])
        for article in item.get('articles', []):
            lists.append(article['title'])




# Japan
lists2 = []
searches_jp = html22['default']['trendingSearchesDays'][0]['trendingSearches']
for item in searches_jp:
    title = item['title']['query']
    print(title)
    lists2.append(title)
    for related in item.get('relatedQueries', []):
        lists2.append(related['query'])
    for article in item.get('articles', []):
        lists2.append(article['title'])
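
Both loops above repeat the same extraction for every day and country. A hedged sketch of a generalized fetcher is below; the function name daily_trends is illustrative. It strips the ")]}'," prefix that the trends endpoint prepends to its JSON body and parses the rest with json.loads.

import json
import requests

def daily_trends(geo, date):
    """Fetch one day of Google daily trends (date is a YYYYMMDD string, geo a country code)."""
    url = ("https://trends.google.com/trends/api/dailytrends"
           "?hl=en-US&tz=-480&ed={}&geo={}&ns=15").format(date, geo)
    raw = requests.get(url).text
    body = raw[raw.index('{'):]   # drop the ")]}'," prefix before the JSON starts
    data = json.loads(body)
    return data['default']['trendingSearchesDays'][0]['trendingSearches']

# e.g. for search in daily_trends('US', '20200417'): print(search['title']['query'])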


import jieba
import jieba.analyse
jieba.analyse.set_stop_words(path)


# tags = jieba.analyse.extract_tags(" ".join(lists), topK=50, withWeight=True)
tags = jieba.analyse.extract_tags(" ".join('%s' % item for item in lists).replace('None', ""), topK=100)
# tags = jieba.analyse.extract_tags(" ".join(lists2), topK=100)  # use the Japan list instead
tags  # display the extracted keywords

text=' '.join(tags)
wc = WordCloud(font_path="/Users/lonng/Library/Fonts/msyh.ttf",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
# word_cloud.to_file('3.png')
plt.figure(figsize=(26,26))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
            

4. Weibo

import requests
import json
import jieba
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl


headers = {
    "Referer": "https://m.weibo.cn/p/index?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36",
}
url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30"
html = requests.get(url,headers = headers).text
html1 = json.loads(html)


contents = []
for i in range(len(html1['data']['cards'][0]['card_group'])):
    words = html1['data']['cards'][0]['card_group'][i]['desc']
    contents.append(words)
for j in range(len(html1['data']['cards'][1]['card_group'])):
    word1 = html1['data']['cards'][1]['card_group'][j]['desc']
    contents.append(word1)


mpl.rcParams["font.sans-serif"] = ["Microsoft YaHei"]  # configure the font so Chinese characters render correctly
path = r'/Users/lonng/Desktop/v+/呆萌的停用词表.txt'  # stop-word list for jieba
jieba.analyse.set_stop_words(path)
# tags1 = jieba.analyse.extract_tags(" ".join(contents), topK=50, withWeight=True)
tags1 = jieba.analyse.extract_tags(" ".join(contents), topK=100)
tags1  # display the extracted keywords


text=' '.join(contents[:50])
wc = WordCloud(font_path="/System/Library/Fonts/PingFang.ttc",background_color='black',max_words=100,max_font_size=120,scale=10)
word_cloud = wc.generate(text)
plt.figure(figsize=(25,25))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
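
tags1 holds the jieba keywords, but the cloud above is drawn from the raw hot-search titles. A minimal sketch of generating from the keywords and saving the image instead (the output filename is illustrative):

word_cloud = wc.generate(' '.join(tags1))
word_cloud.to_file('weibo_hot.png')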
