1 准备工作
- 备注微信好友真实姓名。
- 通过Python的itchat库爬取所有微信好友信息。代码如下:
#-*- coding:utf-8 -*-
"""
-----------------------------------
版本:Python3.6.1
-----------------------------------
"""
import io
# 导入itchat包
import itchat
def main(output_path='C:/Users/Admin/Downloads/friend.txt'):
    """Log in to WeChat through itchat and dump the friend list to a text file.

    The object returned by ``itchat.get_friends`` is written as its ``str()``
    representation, UTF-8 encoded.

    Args:
        output_path: Destination file. Defaults to the path used by the
            original script.
    """
    # Log in by scanning a QR code; hotReload=True keeps the login state for
    # a while so that reruns do not need a new scan.
    itchat.auto_login(hotReload=True)
    # Fetch the friend list.
    friends = itchat.get_friends(update=True)
    # print(friends)
    # Open with 'w' rather than 'a': in append mode every rerun would stack
    # another copy of the list onto the same file, so the dump would no
    # longer contain a single friend list.
    with io.open(output_path, 'w', encoding='utf-8') as f:
        f.write(str(friends))


if __name__ == '__main__':
    main()
2 文本整理
2.1 读取文本
library(pacman)
p_load(data.table, tidyr, stringr, dplyr)
# Read the dump with "{" as the field separator, without a header row, so
# that the text is split into columns at every "{".
df <- fread("C:/Users/Admin/Downloads/friend.txt", header = FALSE,
            stringsAsFactors = FALSE, encoding = "UTF-8", sep = "{")
# Inspect the structure of the data frame
str(df)
# Drop columns 1 and 2: column 1 holds no data and column 2 is the account
# owner, whose record is formatted differently from the other columns.
# Dropping by position (instead of the hard-coded range 3:692) keeps this
# working for any number of friends.
df <- df[, -(1:2), with = FALSE]
2.2 将有用信息清洗成原始素材
# Reshape wide to long: one row per original column, with the column name in
# `id` and the raw record text in `info`.
df2 <- gather(df, key = "id", value = "info")
# Split the record text on "," into one column per field.
# NOTE(review): a value that itself contains "," (e.g. a signature) is cut at
# that comma and shifts the fields after it -- confirm this is acceptable.
df3 <- separate(df2, info, sep = ",",
                into = c("Uin", "UserName", "NickName", "HeadImgUrl",
                         "ContactFlag", "MemberCount", "MemberList",
                         "RemarkName", "HideInputBarFlag", "Sex",
                         "Signature", "VerifyFlag", "OwnerUin", "PYInitial",
                         "PYQuanPin", "RemarkPYInitial", "RemarkPYQuanPin",
                         "StarFriend", "AppAccountFlag", "Statues",
                         "AttrStatus", "Province", "City", "Alias", "SnsFlag",
                         "UniFriend", "DisplayName", "ChatRoomId", "KeyWord",
                         "EncryChatRoomId", "IsOwner"))
# Columns needed downstream
need <- c("id", "NickName", "RemarkName", "Sex", "Signature", "Province",
          "City")

# Keep only the value part of a "'Key': 'value'" piece: drop everything up to
# and including the first ":" plus following whitespace, then strip one
# leading and one trailing quote character if present. Unlike fixed character
# offsets, this does not depend on the key length and also handles values
# that are not quoted (presumably the case for Sex -- TODO confirm).
strip_field <- function(x) {
  x %>%
    str_remove("^[^:]*:\\s*") %>%
    str_remove_all("^['\"]|['\"]$")
}

# all_of() makes it explicit that `need` is an external character vector
# rather than a column called "need".
df4 <- df3 %>%
  select(all_of(need)) %>%
  mutate(
    NickName = strip_field(NickName),
    RemarkName = strip_field(RemarkName),
    Sex = strip_field(Sex),
    Signature = strip_field(Signature),
    Province = strip_field(Province),
    City = strip_field(City)
  )
# Write the preliminarily cleaned text to disk
write.csv(df4, "C:/Users/Admin/Downloads/friend.zl.csv")
2.3 整理MEM班级同学签名
# Read the cleaned file back in
df5 <- read.csv("C:/Users/Admin/Downloads/friend.zl.csv", header = TRUE,
                stringsAsFactors = FALSE)
# Use NickName wherever RemarkName is empty or missing. The result must be
# assigned back; a bare ifelse() call only prints the vector and leaves df5
# unchanged.
no_remark <- is.na(df5$RemarkName) | df5$RemarkName == ""
df5$RemarkName <- ifelse(no_remark, df5$NickName, df5$RemarkName)
# Check whether RemarkName still has missing values
n <- sum(is.na(df5$RemarkName))
n
## [1] 0
# Read the class roster
df.name <- read.csv("./mem.names.csv", header = TRUE, stringsAsFactors = FALSE)
# Keep the roster rows and attach each classmate's WeChat record by name
df6 <- left_join(df.name, df5, by = c(`姓名` = "RemarkName"))
# Collect the signatures into one text. Missing signatures are dropped before
# pasting; removing the literal substring "NA" afterwards would also delete
# those two letters from inside real words.
signatures <- str_trim(df6$Signature)
signatures <- signatures[!is.na(signatures)]
txt <- signatures %>%
  paste(collapse = " ") %>%
  str_remove_all("16L")
txt
## [1] "不以物喜,不以己悲。 Maybe the fault does not lie in the way but in the choice.
一往无前虎山行 尽头在哪儿呢?在学习中!! 一手烂牌是运气,打好烂牌是本事。
∞ The princess is so cool 一枚新时代宝藏硬核女战士 阿耨多罗三藐三菩提心 Talk is chea
淡泊明志 寧靜致遠 人生真味-淡,人生风度-忘。 哪些是科学,哪些是魔术,哪些是信仰
我的征途是星辰大海 君子坦荡荡~~ 人生绝非一场消遣。 学习是一种信仰 stay hungry stay foolish
能在艰苦中成长更需要一份坚决的魄力 "
3 分词
p_load(jiebaR)
# Create a segmentation engine that loads a stop-word dictionary, so that
# meaningless tokens (particles, digits, punctuation and the like) are
# removed from the result.
stop_word_path <- "./dict/characters-master/stop_words"
wk <- worker(stop_word = stop_word_path)
# First segmentation pass with this engine
txt1 <- segment(code = txt, jiebar = wk)
txt1
## [1] "不以" "物喜" "不以己" "悲" "Maybe" "fault" "lie" "choice" "一往无前" "虎山行" "尽头" "学习" "中" "一手" "烂牌"
## [16] "运气" "好烂" "牌" "本事" "The" "princess" "cool" "一枚" "新" "时代" "宝藏" "硬核" "女战士" "耨" "多罗"
## [31] "三" "藐三" "菩提" "心" "Talk" "chea" "淡泊明志" "寧靜致遠" "人生" "真味" "淡" "人生" "风度" "忘" "科学"
## [46] "魔术" "信仰" "征途" "星辰" "大海" "君子" "坦荡荡" "人生" "一场" "消遣" "学习" "一种" "信仰" "stay" "hungry"
## [61] "stay" "foolish" "艰苦" "中" "成长" "更" "一份" "魄力"
可以看到,有些词被切分错了(例如“不以物喜”被切成“不以”“物喜”,“烂牌”被切成“好烂”“牌”),需要手动把这些词加入分词器的用户词典。
# Register the phrases that were split incorrectly as user words, in a single
# vectorised call, before segmenting again.
new_user_word(wk, c("不以物喜", "不以己悲", "烂牌", "阿耨多罗三藐三菩提心"))
重新分词:
# Segment the text again, now that the user words have been added
txt2 <- segment(code = txt, jiebar = wk)
txt2
## [1] "不以物喜" "不以己悲" "Maybe" "fault" "lie" "choice" "一往无前"
## [8] "虎山行" "尽头" "学习" "中" "一手" "烂牌" "运气"
## [15] "打好" "烂牌" "本事" "The" "princess" "cool" "一枚"
## [22] "新" "时代" "宝藏" "硬核" "女战士" "阿耨多罗三藐三菩提心" "Talk"
## [29] "chea" "淡泊明志" "寧靜致遠" "人生" "真味" "淡" "人生"
## [36] "风度" "忘" "科学" "魔术" "信仰" "征途" "星辰"
## [43] "大海" "君子" "坦荡荡" "人生" "一场" "消遣" "学习"
## [50] "一种" "信仰" "stay" "hungry" "stay" "foolish" "艰苦"
## [57] "中" "成长" "更" "一份" "魄力"
统计词频:
# Word-frequency table of the segmented text. The function is called with its
# namespace because the result is stored under the same name, `freq`.
freq <- jiebaR::freq(txt2)
4 词云图
p_load(wordcloud2)
# Figure 1: custom font, random light colours on a grey background
wordcloud2(
  data = freq,
  size = 0.5,
  fontFamily = "微软雅黑",
  color = "random-light",
  backgroundColor = "grey"
)
# Figure 2: rotation angle pinned to -pi/2 (minimum and maximum are equal)
quarter_turn <- -pi / 2
wordcloud2(
  data = freq,
  size = 0.5,
  minRotation = quarter_turn,
  maxRotation = quarter_turn
)
# Figure 3: rotation angle pinned to -pi/6, with rotateRatio set to 1
tilt <- -pi / 6
wordcloud2(
  data = freq,
  size = 0.5,
  minRotation = tilt,
  maxRotation = tilt,
  rotateRatio = 1
)
# Figure 4: word cloud drawn inside a custom image.
# Drawing into a custom image does not work with the latest wordcloud2
# release; this is a bug with no fix yet. The suggested workaround is to
# uninstall the latest version and install the old 0.2.0 release:
#   1. Remove the installed package: remove.packages("wordcloud2")
#   2. Download the old wordcloud2 source package
#      (download location: Index of /src/contrib/Archive/wordcloud2)
#   3. Install the tar.gz file manually
# Put the image into this folder (replace the leading part with your own R
# installation path):
#   C:\Users\Admin\Documents\R\win-library\3.6\wordcloud2\examples
tu <- system.file("examples/t.png", package = "wordcloud2")
wordcloud2(data = freq, size = 0.5, color = "black", figPath = tu)
# Figure 5: word cloud in the shape of a character.
# There are only a few words, so a simple shape and character are used.
letterCloud(
  data = freq,
  word = "王",
  wordSize = 0.5,
  color = "random-dark",
  backgroundColor = "snow"
)
图五偶尔能画出来,但目前无法稳定复现。
总之,图四(自定义图片词云)和图五(文字形状词云)这两个功能在当前版本的 wordcloud2 中仍存在 BUG。