Python character-frequency and word-frequency counting

# Character frequency count
# -*- coding: utf-8 -*-

import collections

# Read the text file and split it into a list of individual characters
with open(r"D:\python\pra\推荐系统1-500.txt", 'r', encoding='utf-8') as f:  # open the file holding the text to process
    txt1 = f.read()
txt1 = txt1.replace('\n', '')  # drop newlines
txt1 = txt1.replace(',', '')  # drop commas
txt1 = txt1.replace('。', '')  # drop full stops
mylist = list(txt1)
mycount = collections.Counter(mylist)
for key, val in mycount.most_common(50):  # sorted: print the 50 most frequent characters
    print(key, val)
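The replace() chain above only strips newlines and two punctuation marks, so digits, Latin letters, and any other punctuation still end up in the count. A minimal sketch (assuming the goal is to count Chinese characters only; same file as above) that filters with a regular expression instead:

import collections
import re

with open(r"D:\python\pra\推荐系统1-500.txt", 'r', encoding='utf-8') as f:
    raw = f.read()

# keep only characters in the CJK Unified Ideographs block,
# dropping punctuation, digits, and Latin letters in one pass
hanzi = re.findall(r'[\u4e00-\u9fff]', raw)
for char, count in collections.Counter(hanzi).most_common(50):
    print(char, count)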

(Screenshot 1: output of the character-frequency count)

# Word frequency count
# -*- coding: utf-8 -*-

import jieba
import jieba.analyse

# text = "故宫的著名景点包括乾清宫、太和殿和午门等。其中乾清宫非常精美,午门是紫禁城的正门,午门居中向阳。"
text = ''
#jieba.load_userdict("jieba_dict.txt")  # user dictionary (list your own custom terms in this text file)
with open(r'D:\python\pra\推荐系统1-500.txt', 'r', encoding='utf8') as f:  # text file to segment (treat everything as UTF-8 to keep things simple)
    for line in f:
        text += line

# seg_list = jieba.cut(text, cut_all=False)  # precise mode (the default)
seg_list = jieba.cut(text)  # precise mode (the default)
print("[precise mode]: ", "/ ".join(seg_list))

# seg_list2 = jieba.cut(text, cut_all=True)    # full mode
# print("[full mode]: ", "/ ".join(seg_list2))

# seg_list3 = jieba.cut_for_search(text)    # search-engine mode
# print("[search-engine mode]: ", "/ ".join(seg_list3))

tags = jieba.analyse.extract_tags(text, topK=5)
print("keywords:    ", " / ".join(tags))

(Screenshot 2: segmentation and keyword-extraction output)

# -*- coding: UTF-8 -*-
import pandas as pd
import jieba
import jieba.analyse

# widen pandas' column display so long text is not truncated
pd.set_option('display.max_colwidth', 500)

# load the data
rows = pd.read_csv('推荐系统1001-1500.csv', header=0, encoding='utf-8', dtype=str)

segments = []
for index, row in rows.iterrows():
    content = row.iloc[2]  # the text to analyse sits in the third column
    # TextRank keyword extraction, limited to the listed parts of speech
    words = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    for word in words:
        # record each extracted word with a count of 1; aggregation happens below
        segments.append({'word': word, 'count': 1})
dfSg = pd.DataFrame(segments)

# word frequency count
dfWord = dfSg.groupby('word')['count'].sum()
# export to CSV
dfWord.to_csv('keywords.csv', encoding='utf-8')
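groupby returns the Series ordered alphabetically by word; if the exported CSV should list the most frequent words first, sorting before the export is a one-liner. A small sketch using the same dfWord:

# sort by count, largest first, so the top keywords lead the CSV
dfWord.sort_values(ascending=False).to_csv('keywords.csv', encoding='utf-8')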

File-reading approach

import jieba

# start with an empty string to collect the segmented text
final = ""
# path of the file to process
filename = r"D:\python\pra\个性化推荐-综述类文献.txt"

# open the file, read it line by line, and segment each line
with open(filename, 'r', encoding='utf-8') as f:
    for line in f.readlines():
        words = jieba.cut(line)
        for w in words:
            final = final + w + " "
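With final holding the space-separated tokens, the actual word-frequency count is one collections.Counter away; a minimal sketch building on final as produced above:

import collections

# split the space-joined segmentation back into tokens and count them
word_counts = collections.Counter(final.split())
for word, count in word_counts.most_common(20):  # 20 most frequent words
    print(word, count)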
