# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Character-frequency statistics.
# ---------------------------------------------------------------------------
import collections


def count_chars(text, top_n=50):
    """Return the ``top_n`` most common characters in ``text``.

    Newlines and the Chinese comma/full stop are stripped before
    counting.  Result is a list of ``(char, count)`` pairs, most
    frequent first (ties keep first-seen order, per Counter semantics).
    """
    # Remove separators that should not be counted as "characters".
    for junk in ('\n', ',', '。'):
        text = text.replace(junk, '')
    return collections.Counter(text).most_common(top_n)


if __name__ == "__main__":
    # Read the large text to analyse (UTF-8) and print the 50 most
    # common characters.  Raw string avoids invalid "\p" escapes in the
    # Windows path; the with-block guarantees the handle is closed.
    with open(r"D:\python\pra\推荐系统1-500.txt", 'r', encoding='utf-8') as f:
        for key, val in count_chars(f.read()):
            print(key, val)
# ---------------------------------------------------------------------------
# Word segmentation and keyword extraction with jieba.
# ---------------------------------------------------------------------------
import jieba
import jieba.analyse

# jieba.load_userdict("jieba_dict.txt")  # optional user-defined dictionary

# Read the whole text at once: the with-block closes the file handle,
# and a single read() replaces the quadratic line-by-line `text += line`
# accumulation of the original.
with open(r'D:\python\pra\推荐系统1-500.txt', 'r', encoding='utf8') as f:
    text = f.read()

# Precise mode is jieba's default; alternatives are cut_all=True (full
# mode) and jieba.cut_for_search() (search-engine mode).
seg_list = jieba.cut(text)
print("[精确模式]: ", "/ ".join(seg_list))

# Top-5 TF-IDF keywords for the whole text.
tags = jieba.analyse.extract_tags(text, topK=5)
print("关键词: ", " / ".join(tags))
# ---------------------------------------------------------------------------
# Per-document keyword extraction (TextRank) followed by a global
# keyword-frequency count exported to CSV.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import codecs

# Widen the pandas display width.  The option key must be namespaced
# ('display.max_colwidth'); the bare 'max_colwidth' alias is removed in
# modern pandas.
pd.set_option('display.max_colwidth', 500)

# Load the data; dtype=str keeps every column as text.
rows = pd.read_csv('推荐系统1001-1500.csv', header=0, encoding='utf-8', dtype=str)

segments = []
for index, row in rows.iterrows():
    # Third column holds the document text — use .iloc for positional
    # access (integer row[2] on a labeled Series is deprecated/removed).
    content = row.iloc[2]
    # TextRank keyword extraction restricted to place names, nouns,
    # verbal nouns and verbs.
    words = jieba.analyse.textrank(content, topK=50, withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
    # Record each keyword occurrence for the global tally.
    for word in words:
        segments.append({'word': word, 'count': 1})

# Aggregate occurrences per keyword and export the counts.
dfSg = pd.DataFrame(segments)
dfWord = dfSg.groupby('word')['count'].sum()
dfWord.to_csv('keywords.csv', encoding='utf-8')
# ---------------------------------------------------------------------------
# File-reading approach: segment an entire file with jieba into one
# space-separated string.  (The original bare heading line "文件读取方式"
# was a SyntaxError; it is now a comment.)
# ---------------------------------------------------------------------------
# Location of the source text file (raw string avoids "\p" escapes).
filename = r"D:\python\pra\个性化推荐-综述类文献.txt"

# Read line by line, cut each line into words, and append every token
# followed by a single space.  Collecting into a list and joining once
# replaces the quadratic `final = final + i + " "` of the original.
tokens = []
with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
        tokens.extend(jieba.cut(line))
final = ''.join(tok + ' ' for tok in tokens)