1.pd.read_csv读取csv
# Load the raw CSV. 'date' and 'time' are forced to str so they can be
# concatenated below and so the original columns stay untouched.
baidu = pd.read_csv(
    'data/content.csv',
    dtype={'date': str, 'time': str},
)
# Build the combined timestamp column "s" explicitly and insert it as the
# first column. This replaces parse_dates={"s": [...]}, keep_date_col=True
# and infer_datetime_format=True, all of which are deprecated in recent
# pandas releases; the original 'date' and 'time' columns are kept.
baidu.insert(0, 's', pd.to_datetime(baidu['date'] + ' ' + baidu['time']))
2.判断DataFrame 类型:
# `测试数据` is a placeholder for the object being checked.
if isinstance(测试数据, pd.DataFrame):
    print("True")
3.把Series合并到DataFrame中并去除空值
# Join the Series onto the DataFrame, then drop every row that has a
# missing value.
content2 = Data.join(Seri).dropna()
content2.head()  # preview the first rows of the result
4.删除列
# Drop the '百度指数' column; content2 itself is left unchanged.
content3 = content2.drop('百度指数', axis=1)
5.添加列
# Add the column back under an ASCII name, cast to integer.
# The builtin `int` is used because the `np.int` alias was removed from NumPy.
content3['baidu_index'] = content2['百度指数'].astype(int)
6.存入csv数据
# Write the frame to a CSV file on disk.
content3.to_csv(path_or_buf='data/content3.csv')
7.分组、计数、排序(由大到小)
# Count the rows in each group of column 'c', then order the counts from
# largest to smallest.
group_sizes = b.groupby('c').size()
a = group_sizes.sort_values(ascending=False)
8.按条件为指定列赋值
# Set 'period' to 1 on the rows whose 'score' value is one of the listed sites.
# `isin` is a method and must be called with the list; the target column is
# addressed by its label.
# NOTE(review): the values look like source names, so the column may have been
# meant to be 'source' rather than 'score' — confirm against the data.
a3.loc[a3['score'].isin(['蚕豆网', '九游网', '爪游控', '手机游戏网']), 'period'] = 1
9.去除缺失值并删除指定列
# Drop rows containing missing values, then remove the 'period' column.
rows_without_na = a3.dropna()
a4 = rows_without_na.drop(columns='period')
10.交叉表计算排名
# Cross-tabulate source (rows) against period (columns); cells are counts.
best = pd.crosstab(index=content4['source'], columns=content4['period'])
# Take the period-1 column, discard sources with a zero post count, and sort
# the remainder from highest to lowest.
period1_counts = best[1]
b1 = period1_counts[period1_counts > 0].sort_values(ascending=False)
11.柱状图显示
# Draw the ranking as a bar chart.
b1.plot(kind='bar')
12.可视化
# One figure: the Baidu index as translucent bars, with the three per-period
# series drawn on top of it.
plt.figure(figsize=(18, 10))
# The index is divided by 30000 before plotting.
# NOTE(review): presumably to bring it onto the scale of the counts — confirm.
scaled_index = baidu_index / 30000
plt.bar(baidu_index.index, scaled_index, alpha=0.2, label='百度指数')
for series, label in (
    (period1Count, '前期:内行'),
    (period2Count, '中期:联系员'),
    (period3Count, '后期:推销员'),
):
    series.plot(label=label)
# Highlight the typical point of each of the three periods with a large,
# translucent red marker.
marker_dates = ['2018-01-23', '2018-01-24', '2018-01-25']
marker_heights = [24, 16, 8]
plt.scatter(marker_dates, marker_heights, s=6000, alpha=0.2, color='r')
plt.xticks(baidu_index.index)
plt.legend()
plt.grid(linewidth=0.2, alpha=0.3)
plt.show()
13.分词
# 导入
import jieba
import jieba.analyse
# Register the custom dictionary. load_userdict takes the dictionary path
# directly, so there is no need to read and split the file by hand.
jieba.load_userdict('jieba/custom.txt')
# Tokenize the title text into a list of words.
period1F = jieba.lcut(period1Title)
# 去停用词
def stopw(period, stopword_path='jieba/stopword.txt'):
    """Remove stop words and single-character tokens from a token list.

    Args:
        period: iterable of token strings (for example the output of
            ``jieba.lcut``).
        stopword_path: path of a UTF-8 text file with one stop word per line.

    Returns:
        A list with the tokens of ``period``, in their original order, that
        are not stop words and are longer than one character.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    stopword = set()  # a set gives O(1) membership tests in the filter below
    with open(stopword_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            # The file spells these two whitespace stop words as escape
            # sequences (a backslash followed by text); turn them into the
            # real characters.
            if word == '\\n':
                word = '\n'
            elif word == '\\u3000':
                word = '\u3000'
            stopword.add(word)
    return [
        token for token in period
        if token not in stopword and len(token) > 1
    ]
# Membership filtering with NumPy: keep the elements of y that do NOT occur
# in x.
x = np.array([1, 2])
y = np.array([1, 2, 4, 5, 6, 8])
z = y[~np.isin(y, x)]  # -> array([4, 5, 6, 8])
# The same kind of filter as a list comprehension, applied to string tokens:
# keep only the tokens longer than one character.
words = ['游戏', '的', '手机游戏']
k = [i for i in words if len(i) > 1]  # -> ['游戏', '手机游戏']
# Count how often each token occurs and sort from most to least frequent.
# Column 0 is the single unnamed column created from the token list.
x1 = pd.DataFrame(stopw(period1F))
periodIC = pd.DataFrame(x1.groupby(0).size().sort_values(ascending=False))
14.词云绘制
from wordcloud import WordCloud
from PIL import Image
# Load the background image as an array; it is handed to WordCloud as the mask.
alice_mask = np.array(Image.open("jieba/timg.jpg"))
# Configure the cloud first, then feed it the text (`文本数据` is a
# placeholder for the input string).
wordcloud = WordCloud(
    background_color='white',
    max_words=30,
    font_path="jieba/arial unicode ms.ttf",
    stopwords=stopword,
    mask=alice_mask,
)
wordcloud.generate(文本数据)
plt.figure(figsize=(18, 10), dpi=300)
# Show the rendered cloud as an image, using bilinear interpolation.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes