'''访问数据库'''
import pandas as pd
from sqlalchemy import create_engine
# Run this cell every time the raw table has to be (re)read from the database.
# NOTE(review): the connection URL embeds the database password in source;
# consider loading it from an environment variable or a config file instead.
# charset is utf8mb4 (MySQL's "utf8" is the 3-byte subset) so this connection
# matches the later cells that read and write the same database.
engine = create_engine('mysql+pymysql://root:yeswedid631@localhost:3306/test?charset=utf8mb4')
# Stream the table in 10,000-row chunks and concatenate them into one frame;
# pd.concat consumes the chunk iterator directly.
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
data = pd.concat(sql)
data.head()
realIP | realAreacode | userAgent | userOS | userID | clientID | timestamp | timestamp_format | pagePath | ymd | ... | fullURLId | hostname | pageTitle | pageTitleCategoryId | pageTitleCategoryName | pageTitleKw | fullReferrer | fullReferrerURL | organicKeyword | source | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2683657840 | 140100 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... | Windows XP | 785022225.1422973265 | 785022225.1422973265 | 1422973268278 | 2015-02-03 22:21:08 | /info/hunyin/hunyinfagui/201404102884290_6.html | 20150203 | ... | 107001 | www.lawtime.cn | 广东省人口与计划生育条例全文2014 - 法律快车婚姻法 | 31 | 故意伤害 | 计划生育 | None | None | None | None |
1 | 973705742 | 140100 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | Windows 7 | 2048326726.1422973286 | 2048326726.1422973286 | 1422973268308 | 2015-02-03 22:21:08 | /ask/exp/17199.html | 20150203 | ... | 1999001 | www.lawtime.cn | 非广州户籍人员可以在广州申请出入境证件吗? - 法律快车法律经验 | 20 | 劳资纠纷 | 出入境 | baidu | http://www.baidu.com/s?wd=%E9%9D%9E%E5%B9%BF%E... | 非广州户籍人员怎么申请预约出入境 | baidu |
2 | 3104681075 | 140100 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... | Windows XP | 1639801603.1422973278 | 1639801603.1422973278 | 1422973277375 | 2015-02-03 22:21:17 | /ask/question_3893276.html | 20150203 | ... | 101003 | www.lawtime.cn | 汽车碰撞自行车或两轮摩托车精确碰撞点的确定方法 - 法律快车法律咨询 | 26 | 定罪量刑 | 法律咨询 | www.haosou.com/s | http://www.haosou.com/s?psid=e79d0155bed18bf4b... | None | www.haosou.com |
3 | 308351962 | 140106 | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | Windows XP | 1597050740.1422973305 | 1597050740.1422973305 | 1422973282739 | 2015-02-03 22:21:22 | /ask/question_5281741.html | 20150203 | ... | 101003 | www.lawtime.cn | 交通事故销案后不满意赔偿可以重新立案吗 - 法律快车法律咨询 | 12 | 伤害赔偿 | 法律咨询 | baidu | http://www.baidu.com/s?word=%E4%BA%A4%E9%80%9A... | 交通事故赔偿后交警要销案吗 | baidu |
4 | 2683657840 | 140100 | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... | Windows XP | 785022225.1422973265 | 785022225.1422973265 | 1422973290048 | 2015-02-03 22:21:30 | /info/hunyin/hunyinfagui/201404102884290_5.html | 20150203 | ... | 107001 | www.lawtime.cn | 广东省人口与计划生育条例全文2014 - 法律快车婚姻法 | 31 | 故意伤害 | 计划生育 | None | None | None | None |
5 rows × 21 columns
1. 网页类型分析:由结果可见最多的为 101 类型(咨询相关),其次为 199 类型(其他), 然后为 107(知识相关)
# Helper that relabels a one-column count frame and adds a percentage column.
def process(df, col, index):
    """Rename the count column, name the index, and add a share column.

    Args:
        df: DataFrame with exactly one column of counts, e.g.
            ``pd.DataFrame(series.value_counts())``. Modified in place.
        col: New name for the count column.
        index: New name for the index.

    Returns:
        The same ``df`` object, now with an extra ``'percent'`` column holding
        each row's fraction of the column total.
    """
    df.columns = [col]
    df.index.name = index
    df['percent'] = df[col] / df[col].sum()
    return df
# Page type = first three characters of fullURLId. The vectorized .str slice
# avoids a per-row Python lambda and leaves missing ids as NaN instead of raising.
data['type'] = data['fullURLId'].str[:3]
counts = pd.DataFrame(data['type'].value_counts())
counts = process(counts, 'num', 'type')
counts
num | percent | |
---|---|---|
type | ||
101 | 411665 | 0.491570 |
199 | 201426 | 0.240523 |
107 | 182900 | 0.218401 |
301 | 18430 | 0.022007 |
102 | 17357 | 0.020726 |
106 | 3957 | 0.004725 |
103 | 1715 | 0.002048 |
2. 咨询类别内部统计:96% 为 101003(咨询内容页),其次为 101002(咨询列表页)
# Break the '101' page type down by its full URL id.
counts_101 = data.loc[data['type'] == '101', 'fullURLId'].value_counts().to_frame()
counts_101 = process(counts_101, 'num', 'fullURLId')
counts_101
num | percent | |
---|---|---|
fullURLId | ||
101003 | 396612 | 0.963434 |
101002 | 7776 | 0.018889 |
101001 | 5603 | 0.013611 |
101009 | 854 | 0.002075 |
101008 | 378 | 0.000918 |
101007 | 147 | 0.000357 |
101004 | 125 | 0.000304 |
101006 | 107 | 0.000260 |
101005 | 63 | 0.000153 |
3. 知识类别内部统计:107 内部只有一种类型,需根据网址对其进行分类,共 3 类,知识内容页,知识首页,知识列表页。由结果可知 90% 都是知识内容页
# Break the '107' page type down by its full URL id.
counts_107 = data.loc[data['type'] == '107', 'fullURLId'].value_counts().to_frame()
counts_107 = process(counts_107, 'num', 'fullURLId')
counts_107
num | percent | |
---|---|---|
fullURLId | ||
107001 | 182900 | 1.0 |
# Classify the '107' pages by URL shape.
j = data.loc[data['type'] == '107', ['fullURL']].copy()
j['type'] = None
# Use .loc instead of chained indexing (j['type'][mask] = ...): chained
# assignment may write to a temporary copy and is ignored under Copy-on-Write.
# Later rules override earlier ones, so the order of the three rules matters.
j.loc[j['fullURL'].str.contains(r'info/.+?/'), 'type'] = '知识首页'
j.loc[j['fullURL'].str.contains(r'info/.+?/.+?'), 'type'] = '知识列表页'
j.loc[j['fullURL'].str.contains(r'/\d+?_*\d+?\.html'), 'type'] = '知识内容页'
# Anything that matched no rule is labelled as "other".
j['type'] = j['type'].fillna('其他')
c107u = pd.DataFrame(j['type'].value_counts())
c107u = process(c107u, 'num', '网页类型')
c107u
num | percent | |
---|---|---|
网页类型 | ||
知识内容页 | 164243 | 0.897993 |
知识列表页 | 9656 | 0.052794 |
知识首页 | 9001 | 0.049213 |
4. 其他类型统计:包含很多法规专题、咨询类以及带 ?的内容
# Classify the '199' (other) pages by URL content.
j = data.loc[data['type'] == '199', ['fullURL']].copy()
j['type'] = None
# Use .loc instead of chained indexing (j['type'][mask] = ...): chained
# assignment may write to a temporary copy and is ignored under Copy-on-Write.
# Later rules override earlier ones, so the order of the rules matters.
j.loc[j['fullURL'].str.contains(r'\?'), 'type'] = '带?'
j.loc[j['fullURL'].str.contains('/ask/'), 'type'] = '其他咨询'
j.loc[j['fullURL'].str.contains('/faguizt/'), 'type'] = '法规专题'
j.loc[j['fullURL'].str.contains('lawfirm'), 'type'] = '律师事务所'
j.loc[j['fullURL'].str.contains('midques'), 'type'] = '中间类型'
# Anything that matched no rule is labelled as "other".
j['type'] = j['type'].fillna('其他')
c199u = pd.DataFrame(j['type'].value_counts())
c199u = process(c199u, 'num', '网页类型')
c199u
num | percent | |
---|---|---|
网页类型 | ||
其他 | 55450 | 0.275287 |
带? | 53283 | 0.264529 |
法规专题 | 47403 | 0.235337 |
其他咨询 | 39415 | 0.195680 |
律师事务所 | 3839 | 0.019059 |
中间类型 | 2036 | 0.010108 |
5. 带 ?的统计:98% 为 1999001,需进一步分析其类型
# Distribution of fullURLId among URLs that contain a '?'.
# Raw string: '\?' in a normal string literal is an invalid escape sequence.
j = data.loc[data['fullURL'].str.contains(r'\?'), 'fullURLId'].value_counts()
c_q = pd.DataFrame(j)
c_q = process(c_q, 'num', 'fullURLId')
c_q
num | percent | |
---|---|---|
fullURLId | ||
1999001 | 64718 | 0.988182 |
301001 | 356 | 0.005436 |
107001 | 346 | 0.005283 |
101003 | 47 | 0.000718 |
102002 | 25 | 0.000382 |
6. 带 ?的 1999001 统计:主要为律师助手、发布类等内容
# Further classify the '?' URLs whose fullURLId is 1999001, by page title.
# .copy() gives an independent frame, so adding the 'type' column below does
# not write into a slice of `data`.
j = data.loc[
    data['fullURL'].str.contains(r'\?') & (data['fullURLId'] == '1999001'),
    ['pageTitle', 'fullURL'],
].copy()
j['type'] = None
# .loc replaces the chained j['type'][mask] = ... assignments, which may write
# to a temporary copy. na=False sends rows with a missing title to "other".
j.loc[j['pageTitle'].str.contains('快车-律师助手', na=False), 'type'] = '快车-律师助手'
j.loc[j['pageTitle'].str.contains('免费发布法律咨询', na=False), 'type'] = '免费发布法律咨询'
j.loc[j['pageTitle'].str.contains('咨询发布成功', na=False), 'type'] = '咨询发布成功'
j.loc[j['pageTitle'].str.contains('快搜', na=False), 'type'] = '快搜'
# Anything that matched no rule is labelled as "other".
j['type'] = j['type'].fillna('其他')
c199q = pd.DataFrame(j['type'].value_counts())
c199q = process(c199q, 'num', '网页类型')
c199q
num | percent | |
---|---|---|
网页类型 | ||
快车-律师助手 | 49894 | 0.770945 |
免费发布法律咨询 | 6166 | 0.095275 |
咨询发布成功 | 5220 | 0.080658 |
快搜 | 1943 | 0.030023 |
其他 | 1495 | 0.023100 |
7. 瞎逛用户统计:即网页后缀不包含.html的
# "Wandering" visits: records whose URL does not contain '.html'.
# na=True keeps the original `== False` behaviour for missing URLs (they are
# not counted as wandering); the raw string avoids the invalid '\.' escape.
j = data.loc[~data['fullURL'].str.contains(r'\.html', na=True), 'type'].value_counts()
cxg = pd.DataFrame(j)
cxg = process(cxg, 'num', '网页类型')
cxg
num | percent | |
---|---|---|
网页类型 | ||
199 | 117124 | 0.712307 |
107 | 17843 | 0.108515 |
102 | 17357 | 0.105559 |
101 | 7130 | 0.043362 |
106 | 3957 | 0.024065 |
301 | 1018 | 0.006191 |
8. 点击次数分析
# Click-count analysis: how many users produced 1, 2, 3, ... records.
# One row per realIP holding that IP's number of records.
click = pd.DataFrame(data['realIP'].value_counts())
click.columns = ['点击次数']
# Constant 1 per user so that summing within a group counts users.
click['用户数'] = 1
# Index: click count; value: number of users with exactly that many clicks.
clcnt = click.groupby('点击次数').sum()
# Share of users that fall in each click-count group.
clcnt['用户百分比'] = clcnt['用户数'] / clcnt['用户数'].sum()
# Share of all records produced by each group (users * clicks / total records).
clcnt['记录百分比'] = clcnt['用户数'] * clcnt.index / len(data)
# Keep the first seven click-count rows and fold every remaining row into a
# single '>7' row (assumes the first seven rows are the counts 1..7 — TODO confirm).
clcnt2 = clcnt.iloc[:7,:].T
clcnt2['>7'] = clcnt.iloc[7:,:].apply(sum)
clcnt2 = clcnt2.T
clcnt2
用户数 | 用户百分比 | 记录百分比 | |
---|---|---|---|
点击次数 | |||
1 | 132119.0 | 0.574059 | 0.157763 |
2 | 44175.0 | 0.191941 | 0.105499 |
3 | 17573.0 | 0.076355 | 0.062952 |
4 | 10156.0 | 0.044128 | 0.048509 |
5 | 5952.0 | 0.025862 | 0.035536 |
6 | 4132.0 | 0.017954 | 0.029604 |
7 | 2632.0 | 0.011436 | 0.022000 |
>7 | 13410.0 | 0.058267 | 0.538136 |
# Finer breakdown of the users with more than 7 clicks.
# Bucket bounds: 8..100 inclusive, 101..1000 inclusive, and above 1000.
clcnt3 = pd.DataFrame(
    {
        '用户数': [
            clcnt.loc[(clcnt.index >= 8) & (clcnt.index <= 100), '用户数'].sum(),
            clcnt.loc[(clcnt.index > 100) & (clcnt.index <= 1000), '用户数'].sum(),
            clcnt.loc[clcnt.index > 1000, '用户数'].sum(),
        ]
    },
    index=['8~100', '100~1000', '>1000'],
)
clcnt3
用户数 | |
---|---|
8~100 | 12952 |
100~1000 | 439 |
>1000 | 19 |
# Page types viewed by users who clicked exactly once.
# Select the wanted rows and column in one .loc call rather than dropping a
# column in place on a filtered slice (which triggers SettingWithCopyWarning).
one = click.loc[click['点击次数'] == 1, ['点击次数']]
# The index of `one` is the realIP, so join it against data's realIP column.
one = pd.merge(one, data[['fullURL', 'fullURLId', 'realIP']], left_index=True, right_on='realIP', how='inner')
one['fullURLId'].value_counts()[:10]
101003 102560
107001 19443
1999001 9381
301001 515
102001 70
103003 45
101002 33
101001 28
102002 13
106001 13
Name: fullURLId, dtype: int64
9. 网页排名:统计以 .html 为结尾的网页点击率排名。另还统计了 101/107 网页的点击人数和点击次数,从而对比其平均点击率。
# Records whose URL contains '.html', ranked by how often each URL occurs.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
html = data[data['fullURL'].str.contains(r'\.html')]
html['fullURL'].value_counts()
http://www.lawtime.cn/faguizt/23.html 6503
http://www.lawtime.cn/info/hunyin/lhlawlhxy/20110707137693.html 4938
http://www.lawtime.cn/faguizt/9.html 4562
http://www.lawtime.cn/info/shuifa/slb/2012111978933.html 4495
http://www.lawtime.cn/faguizt/11.html 3976
...
http://law.lawtime.cn/d383850388944_2_p1.html 1
http://www.lawtime.cn/ask/question_1090226.html 1
http://www.lawtime.cn/ask/question_5224562.html 1
http://www.lawtime.cn/ask/question_8514045.html 1
http://www.lawtime.cn/info/jiaotong/jtlawjtxgfg/2010122779762.html 1
Name: fullURL, Length: 293562, dtype: int64
# Number of click records for page types 107 and 101.
top2 = data.loc[data['type'].isin(['107', '101']), ['type', 'realIP']]
top2.groupby('type').count()
realIP | |
---|---|
type | |
101 | 411665 |
107 | 182900 |
# Number of distinct users (realIP) per page type for 107 and 101.
# nunique() counts distinct IPs directly instead of pivoting every IP into a
# wide table and counting the non-null cells.
top2.groupby('type')['realIP'].nunique()
type
101 176407
107 56340
dtype: int64
# Paginated pages: URLs of the form <digits>_<digits>.html.
# Raw string: '\d' and '\.' are invalid escape sequences in a normal literal.
html2 = data[data['fullURL'].str.contains(r'\d+_\d+\.html')].reset_index()
html2['fullURL'].value_counts()
http://www.lawtime.cn/info/hunyin/lhlawlhxy/20110707137693_2.html 3305
http://www.lawtime.cn/info/shuifa/slb/2012111978933_2.html 2161
http://www.lawtime.cn/info/minshi/fagui/2013051382463_4.html 653
http://www.lawtime.cn/info/hunyin/hunyinfagui/201411053308986_2.html 440
http://www.lawtime.cn/info/jiaotong/jtlawjtxgfg/201411273309942_3.html 377
...
http://www.lawtime.cn/info/hetong/weiyuezeren/2010111175876_2.html 1
http://www.lawtime.cn/info/laodonghetongfa/jiedu/2008101130835_2.html 1
http://www.lawtime.cn/info/laodong/ldzygjfg/20110518102781_5.html 1
http://www.lawtime.cn/info/hetong/clht/20110302122331_20.html 1
http://www.lawtime.cn/info/laodong/gssgpc/2007020719219_2.html 1
Name: fullURL, Length: 13857, dtype: int64
# For each distinct paginated URL keep the part before the first '_', then
# count how many distinct URLs share each such prefix.
mainsite = html2['fullURL'].drop_duplicates().map(lambda url: url.partition('_')[0])
mainsite.value_counts()
http://www.lawtime.cn/askzt/listview 63
http://www.lawtime.cn/info/hetong/htfalv/201312182875586 31
http://www.lawtime.cn/info/xingshisusongfa/falvfagui/201412113310776 29
http://www.lawtime.cn/info/hehuo/falvguiding/201012243286 25
http://www.lawtime.cn/info/xingfa/xingfaquanwenjiedu/20110408114513 24
..
http://www.lawtime.cn/zhishiku/jingjizhongcai/lvshi/2094 1
http://www.lawtime.cn/info/xingfa/feifaxingyizui/20150408/3316284 1
http://www.lawtime.cn/info/shipin/info/shipin/dongtai/20110924184481 1
http://www.lawtime.cn/info/jiaotong/jtpcbz/201405072887732 1
http://www.lawtime.cn/info/fangdichan/fangchanshui/201401032876928 1
Name: fullURL, Length: 8841, dtype: int64
1. 数据清洗:根据上面的数据探索结果,删除无用的数据以及重复的数据
# NOTE(review): the connection URL embeds the database password in source;
# consider loading it from an environment variable or a config file instead.
engine = create_engine('mysql+pymysql://root:yeswedid631@localhost:3306/test?charset=utf8mb4')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
data = pd.concat(sql)  # len(data) = 837450

# Rules 1-5 drop rows that match an unwanted pattern. na=True also drops rows
# whose URL/title is missing, exactly as the former `contains(...) == False`
# comparisons did (NaN == False evaluates to False).
# Remove intermediate-type pages.
rule1 = ~data['fullURL'].str.contains('midques_', na=True)
data = data[rule1]  # len(data) = 835414
# Remove lawyers' own browsing records (title contains '快车-律师助手').
rule2 = ~data['pageTitle'].str.contains('快车-律师助手', na=True)
data = data[rule2]  # len(data) = 782535
# Remove "consultation published successfully" pages.
rule3 = ~data['pageTitle'].str.contains('咨询发布成功', na=True)
data = data[rule3]  # len(data) = 777315
# Remove quick-search pages.
rule4 = ~data['pageTitle'].str.contains('快搜', na=True)
data = data[rule4]  # len(data) = 775302
# Remove "publish a free legal consultation" pages.
rule5 = ~data['pageTitle'].str.contains('免费发布法律咨询', na=True)
data = data[rule5]  # len(data) = 765711
# Strip everything from '?' onwards: the part before '?' mostly keeps the
# original page type. regex=True is required because Series.str.replace treats
# the pattern as a literal string by default since pandas 2.0. assign() returns
# a new frame, so nothing is written into a filtered slice.
data = data.assign(fullURL=data['fullURL'].str.replace(r'\?.*', '', regex=True))
# Remove pages whose URL does not contain the site keyword.
rule6 = data['fullURL'].str.contains('lawtime')
data = data[rule6]  # len(data) = 765610
# Remove pages whose URL does not contain '.html'.
rule7 = data['fullURL'].str.contains(r'\.html')
data = data[rule7]  # len(data) = 670954
# Remove exact duplicate rows.
data = data.drop_duplicates()  # len(data) = 669943
# Save the cleaned data back to the database.
# NOTE(review): if_exists='append' adds the rows again on every re-run of this
# cell; confirm whether 'replace' is what is actually wanted.
data.to_sql('cleaned_gzdata', engine, index=False, if_exists='append')
2. 数据变换:去除翻页重复的网页,手动分类咨询与知识类别
import pandas as pd
from sqlalchemy import create_engine
# NOTE(review): the connection URL embeds the database password in source;
# consider loading it from an environment variable or a config file instead.
engine = create_engine('mysql+pymysql://root:yeswedid631@localhost:3306/test?charset=utf8mb4')
sql = pd.read_sql('cleaned_gzdata', engine, chunksize=10000)
data = pd.concat(sql)  # len(data) = 669943
# Collapse paginated pages (e.g. ..._2.html) onto their first page.
# regex=True is required because Series.str.replace treats the pattern as a
# literal string by default since pandas 2.0; the dot before "html" is escaped
# so that it only matches a literal '.'.
data['fullURL'] = data['fullURL'].str.replace(r'_\d{0,2}\.html', '.html', regex=True)
data = data.drop_duplicates(['fullURL', 'userID'])  # len(data) = 534713
# Manually classify pages as consultation ('ask') or knowledge ('info').
data = data.copy()
data['type'] = 'else'
# .loc replaces the chained data['type'][mask] = ... assignments, which may
# write to a temporary copy. The patterns drop their capture groups (pandas
# warns about match groups in str.contains); 'ask' also matches every 'askzt' URL.
data.loc[data['fullURL'].str.contains('ask'), 'type'] = 'ask'
# Applied second, so a URL matching both rules ends up as 'info'.
data.loc[data['fullURL'].str.contains('info|zhishi|faguizt'), 'type'] = 'info'
3. 属性规约:以婚姻数据为例,只保留用户和网页数据,并且删除掉只被点击了 3 次以下的数据
# Keep only the marriage-related knowledge pages, with the user and URL columns.
data2 = data.loc[data['type'] == 'info', ['realIP', 'fullURL']]
data2 = data2[data2['fullURL'].str.contains('/hunyin/')]  # len(data2) = 16884
# Keep only URLs that were clicked at least 3 times.
cnt = data2['fullURL'].value_counts()
# Look each URL's click count up directly instead of merging and then relying
# on the merge-generated 'fullURL_x' / 'fullURL_y' column names, which depend
# on how the installed pandas version names the value_counts() result.
data2 = pd.DataFrame({
    '网址': data2['fullURL'],
    '用户': data2['realIP'],
    '点击次数': data2['fullURL'].map(cnt),
})
data2 = data2[data2['点击次数'] >= 3]  # len(data2) = 12737
1. 协同过滤:计算各网址间的相似度矩阵,使用杰卡德(Jaccard)相似系数函数。推荐相似度高的网址给对应的用户
'''定义协同过滤算法模型'''
import numpy as np
def jaccard(a, b):
    """Return the Jaccard similarity of two 0-1 vectors.

    The formula ``sum(a*b) / sum(a + b - a*b)`` is only meaningful for
    0-1 vectors.

    Args:
        a: First 0-1 vector (numpy array).
        b: Second 0-1 vector with the same length as ``a``.

    Returns:
        Intersection size divided by union size, or ``0.0`` when both vectors
        are all zeros (empty union) instead of the 0/0 result.
    """
    overlap = (a * b).sum()
    union = (a + b - a * b).sum()
    if union == 0:
        return 0.0
    return 1.0 * overlap / union


class Recommender():
    """Item-based recommender built on a pairwise similarity matrix."""

    sim = None  # similarity matrix; set by fit()

    def similarity(self, x, distance):
        """Return the matrix of ``distance(x[i], x[j])`` for every pair (i, j).

        Every ordered pair is evaluated, so ``distance`` is called
        ``len(x) ** 2`` times and does not have to be symmetric.
        """
        n = len(x)
        y = np.ones((n, n))
        for i in range(n):
            for j in range(n):
                y[i, j] = distance(x[i], x[j])
        return y

    def fit(self, x, distance=jaccard):
        """Compute, store and return the similarity matrix of the rows of ``x``."""
        self.sim = self.similarity(x, distance)
        return self.sim

    def recommend(self, a):
        """Score items for the 0-1 click vector(s) ``a``.

        Args:
            a: 0-1 array whose first dimension matches the fitted matrix.

        Returns:
            ``sim . a`` with already-clicked entries zeroed out.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.sim is None:
            raise RuntimeError('fit() must be called before recommend()')
        # Multiplying by (1 - a) filters out what the user has already clicked.
        return np.dot(self.sim, a) * (1 - a)
# Build the 0-1 user x URL matrix. crosstab counts occurrences by default, so
# any positive count is clipped to 1.
ones = pd.crosstab(data2['用户'], data2['网址'])
ones[ones > 0] = 1
# Shuffle the users, then split 90% / 10% into training and test sets.
p = np.random.permutation(len(ones))
ones = ones.take(p)
# The split point comes from len(ones); the previous code used an undefined
# name (`values`) here and raised NameError.
n_train = int(len(ones) * 0.9)
train = ones.iloc[:n_train, :]
test = ones.iloc[n_train:, :]
# The model works on a URL x user layout, so transpose.
train_d = train.values.T  # train_d.shape = (856, 7882)
test_d = test.values.T  # test_d.shape = (856, 876)
# Fit the model to obtain the URL-to-URL similarity matrix.
rc = Recommender()
sim = rc.fit(train_d)
sim_df = pd.DataFrame(sim)
sim_df.index = train.columns
sim_df.columns = train.columns
sim_df.head()
网址 | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/2010091750425.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/2010120179468.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20110607134263.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20110908150795.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20111118161114.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20120112162743.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/201407083018858.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/201503133315426.html | http://www.lawtime.cn/info/hunyin/caichanfengexieyi/201503133315428.html | http://www.lawtime.cn/info/hunyin/caichangongzheng/2010102668596.html | ... | http://www.lawtime.cn/info/hunyin/znfylawfyq/20120807164892.html | http://www.lawtime.cn/info/hunyin/znfylawfyq/201401062877110.html | http://www.lawtime.cn/info/hunyin/znfylawfyq/201401062877120.html | http://www.lawtime.cn/info/hunyin/znfylawfyq/201402182880772.html | http://www.lawtime.cn/info/hunyin/znfylawfyq/201402182880784.html | http://www.lawtime.cn/info/hunyin/znfylawfyq/201403312883880.html | http://www.lawtime.cn/zhishiku/hunyin/info/1770.html | http://www.lawtime.cn/zhishiku/hunyin/law/1770.html | http://www.lawtime.cn/zhishiku/hunyin/lvshi/1770.html | http://www.lawtime.cn/zhishiku/hunyin/zixun/1770.html |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
网址 | |||||||||||||||||||||
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/2010091750425.html | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.125 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/2010120179468.html | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20110607134263.html | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20110908150795.html | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20111118161114.html | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 856 columns
# Score every URL for each user of the test set: rows are labelled with the
# URLs, columns with the test users.
result = pd.DataFrame(rc.recommend(test_d), index=test.columns, columns=test.index)
result.head()
用户 | 1969390199 | 1925548251 | 474622523 | 1301834865 | 3629127438 | 2965930871 | 3947590926 | 1205624432 | 3629613687 | 308839031 | ... | 1996242490 | 2412541809 | 4260338190 | 1734551867 | 3531176462 | 1872397175 | 176046007 | 1163400658 | 358904846 | 1413636208 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
网址 | |||||||||||||||||||||
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/2010091750425.html | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/2010120179468.html | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.002288 | 0.0 | 0.0 | 0.0 | 0.013514 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.002288 | 0.002288 | 0.0 | 0.000000 | 0.000000 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20110607134263.html | 0.001624 | 0.001624 | 0.000000 | 0.0 | 0.001624 | 0.002217 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.002217 | 0.002217 | 0.0 | 0.001624 | 0.000000 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20110908150795.html | 0.000326 | 0.000326 | 0.000000 | 0.0 | 0.000326 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000326 | 0.000000 | 0.0 |
http://www.lawtime.cn/info/hunyin/caichanfengexieyi/20111118161114.html | 0.000324 | 0.000324 | 0.002381 | 0.0 | 0.000324 | 0.002212 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.002381 | 0.0 | 0.002212 | 0.002212 | 0.0 | 0.000324 | 0.002381 | 0.0 |
5 rows × 876 columns
# Collaborative-filtering recommendation: pick K URLs per user (K=3 by default).
def give_result(recom, K=3):
    """Return the top-K recommended sites for every user.

    Args:
        recom: DataFrame of recommendation scores, one row per site (the
            index) and one column per user. Missing scores count as 0.
            The frame is not modified.
        K: Number of recommendations per user.

    Returns:
        DataFrame indexed by user with columns '推荐1' .. '推荐K'. Slots are
        filled in descending score order; from the first site whose score is
        0 onwards the remaining slots stay NaN.
    """
    scores = recom.fillna(0.0)  # work on a copy; missing scores count as 0
    recommends = ['推荐' + str(y) for y in range(1, K + 1)]  # output column names
    result = pd.DataFrame(index=scores.columns, columns=recommends, dtype=object)
    for i in range(scores.shape[1]):  # i is the position of the i-th user
        # A stable sort keeps the original site order among equal scores.
        ranked = scores.iloc[:, i].sort_values(ascending=False, kind='stable')
        # Slicing to K also covers frames that hold fewer than K sites.
        for k, (site, score) in enumerate(ranked.iloc[:K].items()):
            if score == 0.0:  # nothing left worth recommending
                break
            result.iloc[i, k] = site
    return result
# Recommendations per user, derived from the scores in `result`
# (give_result's default K applies).
recom_result = give_result(result)
recom_result.head()
推荐1 | 推荐2 | 推荐3 | |
---|---|---|---|
用户 | |||
1969390199 | http://www.lawtime.cn/info/hunyin/lihunshouxu/... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... |
1925548251 | http://www.lawtime.cn/info/hunyin/lihunshouxu/... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... |
474622523 | http://www.lawtime.cn/info/hunyin/hunyinfagui/... | http://www.lawtime.cn/info/hunyin/jihuashengyu... | http://www.lawtime.cn/info/hunyin/jihuashengyu... |
1301834865 | http://www.lawtime.cn/info/hunyin/jiehun/hunji... | http://www.lawtime.cn/info/hunyin/jiehun/hunji... | http://www.lawtime.cn/info/hunyin/jiehun/hunji... |
3629127438 | http://www.lawtime.cn/info/hunyin/lihunshouxu/... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... |
2. 随机推荐算法:从用户未点击的网页中随机推荐
# 定义随机推荐函数,向用户推荐 K 个网址,这里假定 K=3
# 1 marks a page the test-set user has NOT clicked; transposed so that rows
# are pages and columns are users.
not_click = (1 - test).T
def rand_recom(recom, K=3, seed=None):
    """Recommend up to K randomly chosen not-yet-clicked sites per user.

    Args:
        recom: DataFrame with one row per site (the index) and one column per
            user; a non-zero cell marks a site that can be recommended.
        K: Number of recommendations per user.
        seed: Optional seed for reproducible sampling. When None, the
            module-level ``random`` generator is used, as before.

    Returns:
        DataFrame indexed by user with columns '推荐1' .. '推荐K'. A user with
        fewer than K candidate sites gets all of them, in index order, and the
        remaining slots are None.
    """
    import random
    rng = random if seed is None else random.Random(seed)
    recommends = ['推荐' + str(y) for y in range(1, K + 1)]
    result = pd.DataFrame(index=recom.columns, columns=recommends, dtype=object)
    for i in range(recom.shape[1]):  # i is the position of the i-th user
        current_col = recom.iloc[:, i]
        sites = list(current_col[current_col != 0].index)
        # Sample only when there are enough candidates; otherwise take them all.
        picks = sites if len(sites) < K else rng.sample(sites, K)
        result.iloc[i, :] = picks + [None] * (K - len(picks))
    return result
# Random recommendations built from the not-yet-clicked matrix `not_click`
# (rand_recom's default K applies).
rand_result = rand_recom(not_click)
rand_result.head()
推荐1 | 推荐2 | 推荐3 | |
---|---|---|---|
用户 | |||
1969390199 | http://www.lawtime.cn/info/hunyin/jiehundengji... | http://www.lawtime.cn/info/hunyin/jclawjicheng... | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... |
1925548251 | http://www.lawtime.cn/info/hunyin/ccfglhccfg/2... | http://www.lawtime.cn/info/hunyin/jiehun/hunji... | http://www.lawtime.cn/info/hunyin/lhlawlhss/20... |
474622523 | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... | http://www.lawtime.cn/info/hunyin/xueqin/nizhi... | http://www.lawtime.cn/info/hunyin/jichengquans... |
1301834865 | http://www.lawtime.cn/info/hunyin/jihuashengyu... | http://www.lawtime.cn/info/hunyin/yichanfenpei... | http://www.lawtime.cn/info/hunyin/hynews/20150... |
3629127438 | http://www.lawtime.cn/info/hunyin/shanyangyiwu... | http://www.lawtime.cn/info/hunyin/fenji/fenjiz... | http://www.lawtime.cn/info/hunyin/hunyinfagui/... |
3. 流行度推荐算法:根据网页的受欢迎程度进行推荐,即推荐点击次数前 K 的网页。
# Popularity-based recommendation: pick K URLs per user (K=3 by default).
def pop_recom(recom, K=3):
    """Recommend each user the first K not-yet-clicked sites in row order.

    Args:
        recom: DataFrame with one row per site (the index) and one column per
            user; a non-zero cell marks a site that can be recommended. The
            rows are taken in the order given, so the caller is expected to
            pass them already sorted by popularity.
        K: Number of recommendations per user.

    Returns:
        DataFrame indexed by user with columns '推荐1' .. '推荐K'. A user with
        fewer than K candidate sites gets all of them and the remaining slots
        are None.
    """
    recommends = ['推荐' + str(y) for y in range(1, K + 1)]
    result = pd.DataFrame(index=recom.columns, columns=recommends, dtype=object)
    for i in range(recom.shape[1]):  # i is the position of the i-th user
        current_col = recom.iloc[:, i]
        # Slicing to K covers the "none", "fewer than K" and "K or more" cases.
        picks = list(current_col[current_col != 0].index[:K])
        result.iloc[i, :] = picks + [None] * (K - len(picks))
    return result
# Rank the URLs by their total number of clicks in the test set, most-clicked first.
popular_index = test.sum().sort_values(ascending=False).index
# Rows follow the popularity ranking; 1 marks a page the user has not clicked.
popular = 1 - test.T.reindex(popular_index)
pop_result = pop_recom(popular)
pop_result.head()
推荐1 | 推荐2 | 推荐3 | |
---|---|---|---|
用户 | |||
1969390199 | http://www.lawtime.cn/info/hunyin/hunyinfagui/... | http://www.lawtime.cn/info/hunyin/jihuashengyu... | http://www.lawtime.cn/info/hunyin/jihuashengyu... |
1925548251 | http://www.lawtime.cn/info/hunyin/hunyinfagui/... | http://www.lawtime.cn/info/hunyin/jihuashengyu... | http://www.lawtime.cn/info/hunyin/jihuashengyu... |
474622523 | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... | http://www.lawtime.cn/info/hunyin/hunyinfagui/... | http://www.lawtime.cn/info/hunyin/jihuashengyu... |
1301834865 | http://www.lawtime.cn/info/hunyin/lhlawlhxy/20... | http://www.lawtime.cn/info/hunyin/hunyinfagui/... | http://www.lawtime.cn/info/hunyin/jihuashengyu... |
3629127438 | http://www.lawtime.cn/info/hunyin/hunyinfagui/... | http://www.lawtime.cn/info/hunyin/jihuashengyu... | http://www.lawtime.cn/info/hunyin/jihuashengyu... |
备注:推荐算法中推荐的网址是用户未点击过的网址,因此用户是否喜欢或者说用户是否会浏览该网址不得而知,因此模型评价暂时无法进行,需要得到用户喜好数据集才能进行评价。源代码及数据文件参考:https://github.com/Raymone23/Data-Mining