import jieba
word = jieba.cut('我爱北京天安门')
for i in word:
    print(i)
Note: the word returned by cut() is not a list but an iterator. An iterator is much like a list; you can think of it as an "invisible list". Its elements, however, have to be accessed one at a time, typically with a for loop, so the two lines above cannot simply be replaced with print(word).
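A quick way to see this behavior (a minimal sketch): list() materializes all the words at once, and the iterator is exhausted after one pass; jieba also provides lcut(), which returns a real list directly.

import jieba

words = jieba.cut('我爱北京天安门')
print(list(words))  # materializes every word, e.g. ['我', '爱', '北京', '天安门']
print(list(words))  # [] because the iterator is exhausted after the first pass
print(jieba.lcut('我爱北京天安门'))  # lcut() returns a list directly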
import jieba
report = open('信托行业报告.txt','r').read()  # pass encoding='utf-8' to open() if the file is UTF-8 encoded
words = jieba.cut(report)
for word in words:
    print(word)
import jieba
report = open('信托行业报告.txt','r').read()
words = jieba.cut(report)
report_words = []
for word in words:  # put words of four or more characters into the list
    if len(word) >= 4:
        report_words.append(word)
print(report_words)
import jieba
report = open('信托行业报告.txt','r').read()
words = jieba.cut(report)
report_words = []
for word in words:  # put words of four or more characters into the list
    if len(word) >= 4:
        report_words.append(word)
from collections import Counter
result = Counter(report_words)
print(result)
import jieba
report = open('信托行业报告.txt','r').read()
words = jieba.cut(report)
report_words = []
for word in words:  # put words of four or more characters into the list
    if len(word) >= 4:
        report_words.append(word)
from collections import Counter
result = Counter(report_words).most_common(50)  # take the 50 most frequent words
print(result)
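To make the counting step concrete, here is a tiny self-contained example (the sample words are made up): Counter maps each element to its count, and most_common(n) returns the n most frequent elements as (word, count) pairs, sorted by count.

from collections import Counter

demo = Counter(['信托', '行业', '信托', '信托', '行业', '监管'])
print(demo)                 # Counter({'信托': 3, '行业': 2, '监管': 1})
print(demo.most_common(2))  # [('信托', 3), ('行业', 2)]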
# 1. Read in the text and segment it with jieba's cut() function
import jieba
report = open('信托行业报告.txt','r').read()
words = jieba.cut(report)
# 2. Use a for loop to pick out the words of four or more characters
report_words = []
for word in words:  # put words of four or more characters into the list
    if len(word) >= 4:
        report_words.append(word)
#print(report_words)
# 3. Count and print how often the high-frequency words occur
from collections import Counter
result = Counter(report_words).most_common(50)  # take the 50 most frequent words
#print(result)
# 4. Draw the word cloud
from wordcloud import WordCloud  # import the word cloud library
content = ' '.join(report_words)  # join the list into a space-separated string
wc = WordCloud(font_path='simhei.ttf',    # font file path (SimHei here)
               background_color='white',  # background color (white here)
               width=1000,                # width
               height=600,                # height
               ).generate(content)        # generate the word cloud
wc.to_file('词云图.png')  # save as a PNG image (relative path)
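Note that font_path='simhei.ttf' assumes the SimHei font file sits in the current working directory; without a font that contains Chinese glyphs the words render as empty boxes. On a standard Windows installation the system copy can usually be referenced instead (an assumed path; it differs on macOS and Linux):

wc = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',  # typical Windows font location (assumption)
               background_color='white', width=1000, height=600).generate(content)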
from PIL import Image
import numpy as np
from wordcloud import WordCloud  # import the required libraries
# 1. Read in the text and segment it with jieba's cut() function
import jieba
report = open('信托行业报告.txt','r').read()
words = jieba.cut(report)
# 2. Use a for loop to pick out the words of four or more characters
report_words = []
for word in words:  # put words of four or more characters into the list
    if len(word) >= 4:
        report_words.append(word)
#print(report_words)
# 3. Count and print how often the high-frequency words occur
from collections import Counter
result = Counter(report_words).most_common(50)  # take the 50 most frequent words
#print(result)
# 4. Draw the word cloud in a custom shape
from PIL import Image
import numpy as np
from wordcloud import WordCloud  # import the required libraries
background_pic = '微博.jpg'  # path of the shape-mask image
images = Image.open(background_pic)  # open the shape-mask image
maskImages = np.array(images)  # convert the shape-mask image into a numeric array
content = ' '.join(report_words)  # join the list into a space-separated string
wc = WordCloud(font_path='simhei.ttf',    # font file path (SimHei here)
               background_color='white',  # background color (white here)
               width=1000,                # width (ignored when mask is set)
               height=600,                # height (ignored when mask is set)
               mask=maskImages            # apply the shape mask
               ).generate(content)        # generate the word cloud
wc.to_file('词云图+自定义形状.png')  # save as a PNG image (relative path)
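One detail worth knowing about the mask parameter: wordcloud treats pure-white (value 255) pixels as "masked out" and draws words only on the non-white area, so the shape image should sit on a white background. If no suitable image is at hand, a mask can also be built directly with NumPy; a minimal sketch that draws inside a circle (the size is arbitrary):

import numpy as np

h = w = 600
y, x = np.ogrid[:h, :w]
inside = (x - w // 2) ** 2 + (y - h // 2) ** 2 <= (w // 2 - 10) ** 2
circle_mask = np.full((h, w), 255, dtype=np.uint8)  # 255 (white) means masked out
circle_mask[inside] = 0  # words may be drawn where the value is not 255
# usage: WordCloud(font_path='simhei.ttf', mask=circle_mask).generate(content)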
from wordcloud import WordCloud, ImageColorGenerator
from imageio import imread
# 1. Read in the text and segment it with jieba's cut() function
import jieba
report = open('信托行业报告.txt','r').read()
words = jieba.cut(report)
# 2. Use a for loop to pick out the words of four or more characters
report_words = []
for word in words:  # put words of four or more characters into the list
    if len(word) >= 4:
        report_words.append(word)
#print(report_words)
# 3. Count and print how often the high-frequency words occur
from collections import Counter
result = Counter(report_words).most_common(50)  # take the 50 most frequent words
#print(result)
# 4. Draw the word cloud in a custom shape
# Get the mask parameter that controls the word cloud's shape
from PIL import Image
import numpy as np
from wordcloud import WordCloud  # import the required libraries
background_pic = '微博.jpg'  # path of the shape-mask image
images = Image.open(background_pic)  # open the shape-mask image
maskImages = np.array(images)  # convert the shape-mask image into a numeric array
content = ' '.join(report_words)  # join the list into a space-separated string
wc = WordCloud(font_path='simhei.ttf',    # font file path (SimHei here)
               background_color='white',  # background color (white here)
               width=1000,                # width (ignored when mask is set)
               height=600,                # height (ignored when mask is set)
               mask=maskImages            # apply the shape mask
               ).generate(content)        # generate the word cloud
# Recolor the word cloud from the image; background_pic is the shape image used above
from wordcloud import WordCloud, ImageColorGenerator
from imageio import imread
back_color = imread(background_pic)  # read the image
image_colors = ImageColorGenerator(back_color)  # extract the image's colors
wc.recolor(color_func=image_colors)  # recolor the word cloud
wc.to_file('词云图+自定义形状+颜色.png')  # save as a PNG image (relative path)
import requests
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'}
# 1. Get the page's source code
url = 'https://s.weibo.com/weibo?q=阿里巴巴'
res = requests.get(url, headers=headers).text
print(res)
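Printing res here usually reveals the problem that motivates the next step: s.weibo.com guards its search results against simple crawlers, typically serving a login page or a placeholder instead of the post list, so the HTML returned by requests does not contain the data we want. A quick hedged check, reusing url and headers from above ('card-wrap' is an assumed class name for the result cards):

res = requests.get(url, headers=headers)
print(res.status_code)          # 200 alone does not mean the post list is present
print('card-wrap' in res.text)  # rough check for the result cards; likely False here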
# Also strip spaces and anything inside Chinese full-width parentheses (the last two re.sub() calls in step 3 below)
# 1. Get the real, fully rendered page source with Selenium
import time
from selenium import webdriver

def get_browser():
    options = webdriver.ChromeOptions()
    # hide the usual automation fingerprints so the site is less likely to block us
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=options)
    # overwrite navigator.webdriver, which sites inspect to detect Selenium
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        """
    })
    return driver

#if __name__ == '__main__':
url = 'https://s.weibo.com/weibo?q=阿里巴巴'
browser = get_browser()
browser.get(url)
time.sleep(6)  # wait for the page to finish rendering
data = browser.page_source
#print(data)
# 2. Use regular expressions to extract the data
import re
# The exact tag structure of the Weibo search page changes over time; the two
# patterns below assume an <a class="name"> tag for the poster and a
# <p class="txt"> tag for the post text, and may need adjusting to the live HTML.
p_source = '<a class="name".*?>(.*?)</a>'
source = re.findall(p_source, data)
p_title = '<p class="txt".*?>(.*?)</p>'
title = re.findall(p_title, data, re.S)
# print(title)
# print(source)
# 3. Clean, print, and collect the data
title_all = ''  # an empty string used to collect all the cleaned titles
for i in range(len(title)):
    title[i] = title[i].strip()
    title[i] = re.sub('<.*?>', '', title[i])     # drop HTML tags
    title[i] = re.sub('[\u200b]', '', title[i])  # drop zero-width spaces
    title[i] = re.sub('(.*?)', '', title[i])   # drop full-width parentheses and their content
    title[i] = re.sub(' ', '', title[i])         # drop spaces
    title_all = title_all + title[i]  # collect the titles by string concatenation
    print(str(i + 1) + '.' + title[i] + '-' + source[i])
# 4. Segment the collected text with jieba's cut() function
import jieba
words = jieba.cut(title_all)  # pass in the title_all string collected above
# 5. Use a for loop to pick out the words of two or more characters
report_words = []
for word in words:
    if len(word) >= 2:
        report_words.append(word)
#print(report_words)
# 6. Get the 50 most frequent words
from collections import Counter
result = Counter(report_words).most_common(50)
#print(result)
# Also strip spaces and anything inside Chinese full-width parentheses (the last two re.sub() calls in step 3 below)
# 1. Get the real, fully rendered page source with Selenium
import time
from selenium import webdriver

def get_browser():
    options = webdriver.ChromeOptions()
    # hide the usual automation fingerprints so the site is less likely to block us
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=options)
    # overwrite navigator.webdriver, which sites inspect to detect Selenium
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        """
    })
    return driver

#if __name__ == '__main__':
url = 'https://s.weibo.com/weibo?q=阿里巴巴'
browser = get_browser()
browser.get(url)
time.sleep(6)  # wait for the page to finish rendering
data = browser.page_source
#print(data)
# 2. Use regular expressions to extract the data
import re
# The exact tag structure of the Weibo search page changes over time; the two
# patterns below assume an <a class="name"> tag for the poster and a
# <p class="txt"> tag for the post text, and may need adjusting to the live HTML.
p_source = '<a class="name".*?>(.*?)</a>'
source = re.findall(p_source, data)
p_title = '<p class="txt".*?>(.*?)</p>'
title = re.findall(p_title, data, re.S)
# print(title)
# print(source)
# 3. Clean, print, and collect the data
title_all = ''  # an empty string used to collect all the cleaned titles
for i in range(len(title)):
    title[i] = title[i].strip()
    title[i] = re.sub('<.*?>', '', title[i])     # drop HTML tags
    title[i] = re.sub('[\u200b]', '', title[i])  # drop zero-width spaces
    title[i] = re.sub('(.*?)', '', title[i])   # drop full-width parentheses and their content
    title[i] = re.sub(' ', '', title[i])         # drop spaces
    title_all = title_all + title[i]  # collect the titles by string concatenation
    print(str(i + 1) + '.' + title[i] + '-' + source[i])
# 4. Segment the collected text with jieba's cut() function
import jieba
words = jieba.cut(title_all)  # pass in the title_all string collected above
# 5. Use a for loop to pick out the words of two or more characters
report_words = []
for word in words:
    if len(word) >= 2:
        report_words.append(word)
#print(report_words)
# 6. Get the 50 most frequent words
from collections import Counter
result = Counter(report_words).most_common(50)
#print(result)
# 7. Draw the word cloud in the chosen shape and colors
# (1) Get the mask parameter that controls the word cloud's shape
from PIL import Image
import numpy as np
from wordcloud import WordCloud  # import the required libraries
background_pic = '微博.jpg'  # path of the shape-mask image
images = Image.open(background_pic)  # open the shape-mask image
maskImages = np.array(images)  # convert the shape-mask image into a numeric array
# (2) Draw the word cloud inside the shape mask
content = ' '.join(report_words)  # join the list into a space-separated string
wc = WordCloud(font_path='simhei.ttf',    # font file path (SimHei here)
               background_color='white',  # background color (white here)
               width=1000,                # width (ignored when mask is set)
               height=600,                # height (ignored when mask is set)
               mask=maskImages            # apply the shape mask
               ).generate(content)        # generate the word cloud
# (3) Recolor the word cloud from the image; background_pic is the shape image used above
from wordcloud import WordCloud, ImageColorGenerator
from imageio import imread
back_color = imread(background_pic)  # read the image
image_colors = ImageColorGenerator(back_color)  # extract the image's colors
wc.recolor(color_func=image_colors)  # recolor the word cloud
wc.to_file('微博内容词云图.png')  # save as a PNG image (relative path)