做词云与词频统计(中英文词云与中英文词频统计)的区别

WordCloud 的 generate 函数默认以空格(一个或多个连续空格均可)作为分隔符:

import matplotlib.pyplot as plt
from wordcloud import WordCloud  # was missing in the original snippet

# Mask image that gives the word cloud its shape.
backgroud_Image = plt.imread(r'E:\360MoveData\Users\ASUS\Desktop\tmp03\词云背景.jpg')

# generate() expects ONE string of whitespace-separated words, so join the
# token list with spaces (one or more spaces both work as separators).
newtxt = '  '.join(words)  # `words` is a list of tokens (defined elsewhere)

# NOTE: the original wrote "stopwords = excludes, \#excludes是停用词list" —
# a comment after a line-continuation backslash is a SyntaxError in Python.
# A parenthesized call needs no backslashes and allows inline comments.
wordcloud = WordCloud(
    background_color="white",
    mask=backgroud_Image,     # shape the cloud like the mask image
    width=800,
    height=600,
    font_path="msyh.ttc",     # CJK-capable font, required for Chinese text
    max_words=200,
    max_font_size=80,
    stopwords=excludes,       # `excludes` is a stop-word list (defined elsewhere)
).generate(newtxt)            # input: whitespace-separated word string

中文词云:

import jieba
from wordcloud import WordCloud

# Read the Chinese source text. The original called open() without binding
# the file object, so the following f.read() raised NameError; a context
# manager both binds `f` and guarantees it is closed.
with open(r"filepath", encoding='utf-8') as f:
    txt = f.read()

# Chinese has no spaces between words, so segment first with jieba;
# lcut() returns a list of tokens.
words = jieba.lcut(txt)
# generate() wants a single whitespace-separated string.
newtxt = ' '.join(words)

wordcloud = WordCloud(
    background_color="white",
    width=800,
    height=600,
    font_path="msyh.ttc",     # CJK font; without it Chinese renders as boxes
    max_words=200,
    max_font_size=80,
    stopwords=excludes,       # `excludes` is a stop-word list (defined elsewhere)
).generate(newtxt)
wordcloud.to_file('词云1.png')

英文词云:

from wordcloud import WordCloud

# English text is already space-separated, so no tokenizer is needed:
# generate() splits on whitespace by itself.
with open(r'Englishfilepath', 'r', encoding='utf-8') as file:
    text = file.read()

wordcloud = WordCloud(
    background_color="white",
    width=800,
    height=600,
    max_words=200,
    max_font_size=80,
    # The original passed mask=mask without ever defining `mask`, which
    # raises NameError. Load an image first if a shaped cloud is wanted:
    #   mask = plt.imread('mask_image.jpg')
).generate(text)

# Save the rendered cloud to disk.
wordcloud.to_file('picturename.png')

英文词频统计1.0:

def pretreatment():
    """Lower-case the sample sentence, strip punctuation, and return the
    word list (each word is also printed, as in the original)."""
    text = 'We need to use window.load, not document.ready, because in Chrome'
    text = text.lower()
    # str.replace returns a NEW string; the original discarded the result,
    # so punctuation was never removed. Rebind, and replace with a SPACE
    # (as the original comment intended) so "window.load" splits in two.
    for ch in ",.!、!@#$%^'":
        text = text.replace(ch, ' ')
    words = text.split()
    for word in words:
        print(word)
    return words


def main():
    """Count word frequencies and print each as "word:count"."""
    counts = {}
    for word in pretreatment():
        # get(word, 0) starts unseen words at 0, then increment by one.
        counts[word] = counts.get(word, 0) + 1
    for word in counts:
        print("{}:{}".format(word, counts[word]))


main()

英文词频统计1.1(加排序):

def pretreatment():
    """Lower-case the sample sentence, strip punctuation, and return the
    word list (each word is also printed, as in the original)."""
    text = 'We need to use window.load, not document.ready, because in Chrome'
    text = text.lower()
    # str.replace returns a NEW string; the original discarded the result,
    # so punctuation was never removed. Rebind, and replace with a SPACE
    # (as the original comment intended) so "window.load" splits in two.
    for ch in ",.!、!@#$%^'":
        text = text.replace(ch, ' ')
    words = text.split()
    for word in words:
        print(word)
    return words


def main():
    """Count word frequencies and print the top 2, most frequent first."""
    counts = {}
    for word in pretreatment():
        # get(word, 0) starts unseen words at 0, then increment by one.
        counts[word] = counts.get(word, 0) + 1

    # Sort (word, count) pairs by count, descending.
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

    # Print the TOP 2. Slicing (unlike the original range(2) indexing)
    # cannot raise IndexError when fewer than 2 distinct words exist.
    for word, count in ranked[:2]:
        print('{0:<20}{1:>10}'.format(word, count))


main()


中文词频统计排序可以看我的另一篇博客:
https://blog.csdn.net/qq_41228218/article/details/86765042

机器之心词云:

import networkx as nx
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build a directed graph whose edges come from "source,target" lines.
G = nx.DiGraph()

with open(r'E:\360MoveData\Users\ASUS\Desktop\tmp\zhihuall\all.txt', 'r',
          encoding='utf-8') as f:
    # set() removes duplicate lines (NOTE: it also loses the file order,
    # which is irrelevant here because PageRank is order-independent).
    tolines = list(set(f.readlines()))

edges = []
for line in tolines:
    parts = line.split(',')
    # Only well-formed "a,b" lines become edges. Catch IndexError
    # specifically instead of the original bare `except`, which would
    # also swallow KeyboardInterrupt and hide real bugs.
    try:
        edges.append((parts[0].strip(), parts[1].strip()))
    except IndexError:
        print('something went wrong')

for source, target in edges:
    G.add_edge(source, target)

# Rank nodes with PageRank (alpha=1 means no random-teleport damping).
pagerank_list = nx.pagerank(G, alpha=1)
pagerank_list_order = sorted(pagerank_list.items(), key=lambda x: x[1], reverse=True)

# Keep the names of the 50 highest-ranked nodes.
list01 = [name for name, _score in pagerank_list_order[:50]]

# The original also built a repetition-weighted list02 (each word copied
# len(list01)*10 times — the `b -= 1` never took effect because b was
# reset every iteration) and then immediately overwrote newtxt with the
# unweighted join, so list02 never influenced the output. That dead code
# is removed; the rendered cloud is identical.
newtxt = ' '.join(list01)

# Mask image that gives the word cloud its shape.
backgroud_Image = plt.imread(r'E:\360MoveData\Users\ASUS\Desktop\tmp03\词云背景.jpg')
wordcloud = WordCloud(
    background_color="gray",
    mask=backgroud_Image,
    width=800,
    height=600,
    font_path="msyh.ttc",   # CJK font for Chinese node names
    max_words=200,
    max_font_size=80,
).generate(newtxt)
wordcloud.to_file('词云11.png')




你可能感兴趣的:(Python)