The generate function splits its input on whitespace by default (one space or several both work):
import matplotlib.pyplot as plt
from wordcloud import WordCloud

background_image = plt.imread(r'E:\360MoveData\Users\ASUS\Desktop\tmp03\词云背景.jpg')
newtxt = ' '.join(words)  # words is a list of tokens; join turns it into one space-separated string
wordcloud = WordCloud(background_color="white",
                      mask=background_image,
                      width=800,
                      height=600,
                      font_path="msyh.ttc",
                      max_words=200,
                      max_font_size=80,
                      stopwords=excludes,  # excludes is a list of stopwords
                      ).generate(newtxt)  # build the word cloud; the input is a string of space-separated words
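A quick way to confirm this separator behavior (a standalone sketch; the input string is made up):

from wordcloud import WordCloud

# generate() splits on runs of whitespace, so several spaces act like one
demo = WordCloud().generate("python   python  wordcloud")
print(demo.words_)  # normalized frequencies, e.g. {'python': 1.0, 'wordcloud': 0.5}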
Chinese word cloud:
import jieba
from wordcloud import WordCloud

f = open(r"filepath", 'r', encoding='utf-8')
txt = f.read()
f.close()
words = jieba.lcut(txt)  # jieba.lcut returns the segmentation result as a list
newtxt = ' '.join(words)
excludes = {"的", "了"}  # placeholder stopword collection; fill in your own words
wordcloud = WordCloud(background_color="white",
                      width=800,
                      height=600,
                      font_path="msyh.ttc",
                      max_words=200,
                      max_font_size=80,
                      stopwords=excludes,
                      ).generate(newtxt)
wordcloud.to_file('词云1.png')
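To preview the result on screen as well as saving it, matplotlib can render the cloud directly (a minimal sketch following the wordcloud documentation):

import matplotlib.pyplot as plt

plt.imshow(wordcloud, interpolation='bilinear')  # a WordCloud object renders as an image
plt.axis('off')
plt.show()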
English word cloud:
from wordcloud import WordCloud

with open(r'Englishfilepath', 'r', encoding='utf-8') as file:
    text = file.read()
wordcloud = WordCloud(background_color="white",
                      width=800,
                      height=600,
                      max_words=200,
                      max_font_size=80,
                      # mask=mask,  # optional: a numpy image array (see the sketch below)
                      ).generate(text)
# save the image
wordcloud.to_file('picturename.png')
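If you do want the mask parameter shown (commented out) above, it must be a numpy image array; a sketch of loading one ('mask.png' is a hypothetical path):

import numpy as np
from PIL import Image

# White (255) regions of the mask stay blank; darker regions are filled with words
mask = np.array(Image.open('mask.png'))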
English word frequency count 1.0:
def pretreatment():
    a = 'We need to use window.load, not document.ready, because in Chrome'
    a1 = a.lower()
    for b in ",.!、！@#$%^'":  # replace every symbol other than word characters with a space
        a1 = a1.replace(b, ' ')  # str.replace returns a new string, so the result must be reassigned
    ls = a1.split()
    for s in ls:
        print(s)
    return ls

def main():
    d = {}
    for word in pretreatment():
        # dict.get(word, 0) + 1: a word not yet in d starts at a count of 1;
        # a word already in d has its count incremented
        d[word] = d.get(word, 0) + 1
    for k in d:
        print("{}:{}".format(k, d[k]))

main()
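The counting step in main() is exactly what collections.Counter from the standard library does; an equivalent sketch:

from collections import Counter

counts = Counter(pretreatment())  # builds the word -> count mapping in one call
for word, count in counts.items():
    print("{}:{}".format(word, count))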
English word frequency count 1.1 (with sorting):
def pretreatment():
    a = 'We need to use window.load, not document.ready, because in Chrome'
    a1 = a.lower()
    for b in ",.!、！@#$%^'":  # replace every symbol other than word characters with a space
        a1 = a1.replace(b, ' ')  # reassign: str.replace does not modify the string in place
    ls = a1.split()
    for s in ls:
        print(s)
    return ls

def main():
    d = {}
    for word in pretreatment():
        # dict.get(word, 0) + 1: start a new word at 1, or increment an existing count
        d[word] = d.get(word, 0) + 1
    # convert the dict into a list of (word, count) pairs and sort by count
    dictList = list(d.items())
    dictList.sort(key=lambda x: x[1], reverse=True)
    # print the top 2
    for i in range(2):
        word, count = dictList[i]
        print('{0:<20}{1:>10}'.format(word, count))

main()
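Counter also covers the sort-then-slice step: most_common(n) returns the top n (word, count) pairs already ordered by count (a sketch reusing pretreatment() above):

from collections import Counter

counts = Counter(pretreatment())
for word, count in counts.most_common(2):  # top 2, same as the loop above
    print('{0:<20}{1:>10}'.format(word, count))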
For Chinese word frequency counting with sorting, see my other blog post:
https://blog.csdn.net/qq_41228218/article/details/86765042
机器之心 (Jiqizhixin) word cloud:
import networkx as nx
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Create a directed graph
G = nx.DiGraph()

# Build the digraph's edge list from the file (one "source,target" pair per line),
# deduplicating the lines first
with open(r'E:\360MoveData\Users\ASUS\Desktop\tmp\zhihuall\all.txt', 'r', encoding='utf-8') as f:
    tolines = f.readlines()
tolines = list(set(tolines))
edges = []
for i in tolines:
    try:
        edges.append((i.split(',')[0].strip(), i.split(',')[1].strip()))
    except IndexError:
        print('something went wrong')  # the line did not contain a comma
for edge in edges:
    G.add_edge(edge[0], edge[1])

# Rank the nodes with PageRank (alpha is the damping factor; alpha=1 means no random jumps)
pagerank_list = nx.pagerank(G, alpha=1)
pagerank_list_order = sorted(pagerank_list.items(), key=lambda x: x[1], reverse=True)
# print("pagerank dictionary values:", pagerank_list_order)

# Keep the 50 highest-ranked node names
list01 = [i[0] for i in pagerank_list_order][:50]

# Repeat each word more often the higher its rank, so that generate()
# gives higher-ranked words a larger font
list02 = []
n = len(list01)
for a in range(n):
    for c in range((n - a) * 10):
        list02.append(list01[a])
newtxt = ' '.join(list02)

background_image = plt.imread(r'E:\360MoveData\Users\ASUS\Desktop\tmp03\词云背景.jpg')
wordcloud = WordCloud(background_color="gray",
                      mask=background_image,
                      width=800,
                      height=600,
                      font_path="msyh.ttc",
                      max_words=200,
                      max_font_size=80
                      ).generate(newtxt)
wordcloud.to_file('词云11.png')
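Since pagerank_list_order already pairs each word with a numeric score, the repetition loop can be skipped entirely: WordCloud.generate_from_frequencies accepts a word-to-weight dict directly (a sketch reusing the names above; the output file name is made up):

# Feed the PageRank scores in as frequencies instead of repeating tokens
top50 = dict(pagerank_list_order[:50])
wc = WordCloud(background_color="gray", mask=background_image,
               width=800, height=600, font_path="msyh.ttc",
               max_words=200, max_font_size=80)
wc.generate_from_frequencies(top50)
wc.to_file('词云11_freq.png')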