Data: 6 user portraits, each with 700 tags.
Explanation: under each tag, every portrait has a corresponding value; I want to cluster the portraits on this data.
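For intuition, the input can be pictured as a tag-by-portrait matrix: one row per tag, one column per portrait, each cell holding that portrait's value for the tag. A minimal sketch of a slice of it (the tag and portrait names here are made up):

import pandas as pd

# hypothetical 3-tag x 2-portrait slice of the real 700 x 6 matrix
df = pd.DataFrame(
    {'portrait_A': [0.12, 0.30, 0.58],
     'portrait_B': [0.25, 0.25, 0.50]},
    index=['age:18-24', 'age:25-34', 'age:35+'],  # made-up tag names
)
print(df.T.shape)  # (2, 3): clustering runs on the transpose, portraits x tags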
Result display:
Backend code:
import os
from datetime import datetime

import pandas as pd
from django.db.models import Count
from django.http import HttpResponse
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# also needed (project-local, import paths assumed):
#   CrowdPerspective  -- the Django model queried below
#   serializer        -- a JSON serialization helper

def kmeans_img(request):
    # read the parameters posted by the front end
    UPLOAD_ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static/img')
brand_name = request.POST.get('brand_name', '').strip()
package_name_arr = request.POST.getlist('package_name_arr[]', [])
label_lists = request.POST.getlist('label_lists[]', [])
    font_name = request.POST.get('font_name', '0').strip()
    K_value = request.POST.get('K_value', '2').strip()
    # axis limits set after a zoom-in on the front end
    x_min = request.POST.get('x_min', None)
    x_max = request.POST.get('x_max', None)
    y_min = request.POST.get('y_min', None)
    y_max = request.POST.get('y_max', None)
    font_size_diy = int(request.POST.get('font_size_diy')) if request.POST.get('font_size_diy') else 7
    font_color_diy = request.POST.get('font_color_diy') if request.POST.get('font_color_diy') else '000000'
    font_color_diy = '#' + font_color_diy
    point_size_diy = int(request.POST.get('point_size_diy')) if request.POST.get('point_size_diy') else 5
    # ids of the packages to carve out
    except_packages_list = request.POST.getlist('except_packages_list[]', [])
    # names of the packages to carve out
    except_checked_packname_arr = request.POST.getlist('except_checked_packname_arr[]', [])
    # '1' = re-cluster without the carved-out packages, '0' = only hide their points
    check_loop_k = request.POST.get('check_loop_k', '0').strip()
    if check_loop_k == '1':  # '==', not 'is': 'is' compares object identity, not string value
        package_name_arr = list(set(package_name_arr).difference(set(except_packages_list)))
    packages = (CrowdPerspective.objects
                .filter(brand=brand_name, package_id__in=package_name_arr, label__in=label_lists)
                .values("package_name", "package_id", "package_count")
                .order_by("package_name")
                .annotate(total_pages=Count("package_id"))
                .all())
print('*****')
packages_df = pd.DataFrame(list(packages))
print(packages_df)
error_info = ''
temp_sum = []
df_empty = pd.DataFrame({})
    # per package: validate it, then collect its secondary-label values
    for k in range(len(packages)):
        if packages[k]['package_id'] == 0:
            error_info = packages[k]['package_name'] + ': package is missing its package ID'
            break
        Cp = CrowdPerspective.objects.filter(package_id=packages[k]['package_id'], label__in=label_lists).all().values()
temp = []
count = 0
inter_err_info = ''
for data in Cp:
p_data = []
p_show_name = []
p_index = []
if data and data["postData"] and data["postData"]["axises"] and data["postData"]["axises"][0]:
p_data = data["postData"]["datas"][0]["values"]
i = 0
for ax in data["postData"]["axises"][0]["values"]:
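                    # each row index has the form "label:tag:showName:::key";
                    # note: .lstrip('概览_') strips a leading character *set*, not the
                    # literal prefix -- it works on this data but is fragile in general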
if data["tag"]!='undefined':
if data["tag"]=='预测年龄':
p_index.append(data["label"].lstrip('概览_')+":"+data["tag"].lstrip('预测')+":"+ax["showName"]+":::"+ax["key"])
else:
p_index.append(data["label"].lstrip('概览_')+":"+data["tag"]+":"+ax["showName"]+":::"+ax["key"])
else:
p_index.append(data["label"].lstrip('概览_')+":"+data["alias_tag"]+":"+ax["showName"]+":::"+ax["key"])
i += 1
            else:
                inter_err_info = packages[k]['package_name'] + ': package data is malformed'
                break
            count += 1
            # what identifies each point in the cluster plot: package name or package id
            if font_name == '0':
                # matrix of secondary labels, one column named after the package
                obj = pd.DataFrame(p_data, index=p_index, columns=[packages[k]['package_name']])
            else:
                obj = pd.DataFrame(p_data, index=p_index, columns=[packages[k]['package_id']])
# print(obj)
temp.append(obj)
if inter_err_info !='':
error_info = inter_err_info
break
        # merge this package's secondary labels into one frame for clustering
        resl = pd.concat(temp)
        # print(resl.index.is_unique)
        # guard against duplicated labels, which would plot extra rings
        resl = resl.reset_index(drop=False)
        # drop duplicate rows by label
        resl = resl.drop_duplicates('index')
        # print(resl.index.is_unique)
        resl = resl.set_index('index')
df_empty = pd.concat([df_empty,resl],axis=1)
if error_info !='':
data = {'status': 'error','msg': error_info}
json_data = serializer(data, output_type='json')
return HttpResponse(json_data, content_type="application/json")
df_empty = df_empty.reset_index(drop=False)
df_empty['index'] = df_empty['index'].map(lambda x: x.split(':::')[0])
    # drop rows whose label value is a catch-all or outlier category
    # (the Chinese literals match values stored in the data, so they stay as-is)
    df_empty.drop(df_empty[df_empty['index'].str.contains(r':(?:其他|未知|香港|台湾|澳门|理财专家)')].index, inplace=True)
df_empty = df_empty.set_index('index')
    # fill NaN (labels missing from some packages) with '0' so the float cast works
    df_empty = df_empty.where(df_empty.notnull(), '0')
    dfs = df_empty.T.astype('float64')
    dataSet = dfs.to_numpy()  # .as_matrix() was removed from pandas; to_numpy() replaces it
    print('dataSet-----:%s' % (dataSet))
    # the steps above clean the data: fill empty cells and drop "other"/"unknown" style rows
dataLine = dfs.index
print(dataLine)
print('---------------------------------')
    import matplotlib
    matplotlib.use('Agg')  # headless backend: render to files, no display needed
    import matplotlib.pyplot as plt
    # plt.rcParams['font.sans-serif']=['PingFang HK']
    # plt.rcParams['axes.unicode_minus']=False
    # remove the plot frame: of the axes' four spines, hide the top and right ones
    ax = plt.gca()
    ax.spines['top'].set_color('none')
    ax.spines['right'].set_color('none')
ktest = KMeans(n_clusters=int(K_value))
try:
model = ktest.fit(dataSet)
y_pred=ktest.predict(dataSet)
print('y_pred------:%s'%(y_pred))
print('-------------------------------')
    except Exception as e:
        # str(e): concatenating the exception object itself would raise a TypeError
        data = {'status': 'error', 'msg': 'clustering failed: ' + str(e)}
json_data = serializer(data, output_type='json')
return HttpResponse(json_data, content_type="application/json")
    # centers = model.cluster_centers_
    # project onto the principal components so the clusters can be drawn in 2-D/3-D
    pca = PCA(copy=True, n_components=3, whiten=False)
    print('pca------:%s' % (pca))
    # fit_transform centers the data and keeps the top 3 components (dimensionality reduction)
    lowDDataMat = pca.fit_transform(dataSet)
    print('lowDDataMat----:%s' % (lowDDataMat))
    # figure font setup so Chinese labels render
    plt.rcParams['font.sans-serif'] = ['SimHei']
    dfs_xy = pd.DataFrame(lowDDataMat, columns=['x', 'y', 'z'])
    print('dfs_xy before merge------:%s' % (dfs_xy))
    dfs_xy['label'] = y_pred
    dfs_xy['name'] = dataLine
    print('dfs_xy*****')
    print('dfs_xy after merge------:%s' % (dfs_xy))
    print('------------c---c----c-------------')
    if font_name == '0':
        packages_df = packages_df.rename(columns={'package_name': 'name'})
        new_dfs_xy = dfs_xy.merge(packages_df, how='left', on='name')
    else:
        # the queryset selects 'package_id', not 'id'; renaming 'id' would leave no 'name' column to merge on
        packages_df = packages_df.rename(columns={'package_id': 'name'})
        new_dfs_xy = dfs_xy.merge(packages_df, how='left', on='name')
print(new_dfs_xy)
# new_dfs_xy = new_dfs_xy.fillna('0')
# new_dfs_xy['package_count'] = new_dfs_xy['package_count'].astype(int)
file_name=datetime.now().strftime("%Y%m%d%H%M%S")
new_dfs_xy[['x','y','z','label','name','package_count']].to_excel(os.path.join(UPLOAD_ROOT, file_name+'.xlsx'))
    # when re-clustering ('1') the carved-out packages were already removed above;
    # when only hiding points ('0'), filter them out of the plot data here
    if check_loop_k == '0' and font_name == '1':
        # points are named by package id
        print('removing by package id:')
        new_dfs_xy['name'] = new_dfs_xy['name'].astype(str)
        new_dfs_xy = new_dfs_xy[~(new_dfs_xy.name.isin(except_packages_list))]
    if check_loop_k == '0' and font_name == '0':
        # points are named by package name
        print('removing by package name:')
        new_dfs_xy = new_dfs_xy[~(new_dfs_xy.name.isin(except_checked_packname_arr))]
    if x_min is not None:
        print('zoom range:')
        new_dfs_xy = new_dfs_xy[(new_dfs_xy.x >= float(x_min)) & (new_dfs_xy.x <= float(x_max))
                                & (new_dfs_xy.y >= float(y_min)) & (new_dfs_xy.y <= float(y_max))]
    # plt.scatter(lowDDataMat[:,0],lowDDataMat[:,1],c=y_pred,cmap=plt.cm.Paired,s=point_size_diy)
    # the s parameter also accepts an array, e.g. to size each point individually:
    # area=(30*np.random.rand(10))**2
    # plt.scatter(x,y,s=area,c=colors,alpha=0.5)
    # plt.scatter(new_dfs_xy.x.values,new_dfs_xy.y.values,c=new_dfs_xy.label.values,cmap=plt.cm.Paired,s=new_dfs_xy.package_count.values/10000)
plt.scatter(new_dfs_xy.x.values,new_dfs_xy.y.values,c=new_dfs_xy.label.values,cmap=plt.cm.Paired,s=point_size_diy)
    from matplotlib.font_manager import FontProperties
    # a custom font; the file name comes from listing the system's Chinese fonts
    myfont = FontProperties(fname='/usr/share/fonts/SimHei/msyh.ttf')
    # keep the minus sign '-' from rendering as a square
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.rc('font', family='Microsoft YaHei', size=font_size_diy, weight='bold')
    # label each point with its package name/id
    for i, line in enumerate(new_dfs_xy.name.values):
        # x=lowDDataMat[i][0]
        # y=lowDDataMat[i][1]
        x = new_dfs_xy.x.values[i]
        y = new_dfs_xy.y.values[i]
        if line != '.':
            plt.annotate(line, xy=(x, y), xytext=(x, y), color=font_color_diy)
plt.savefig(os.path.join(UPLOAD_ROOT, file_name+'.png'),dpi=1000,facecolor='w', transparent=True)
plt.close()
data = {'status': 'success','file_name':file_name+'.png', 'xy':file_name,'msg': 'ok!'}
json_data = serializer(data, output_type='json')
return HttpResponse(json_data, content_type="application/json")
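Stripped of the Django plumbing, the core of the view above is a three-step pipeline: build a portraits-by-tags matrix, cluster it with K-means, and project it with PCA for plotting. A minimal sketch under those assumptions (random toy data, sized per the description at the top):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

rng = np.random.default_rng(42)
X = rng.random((6, 700))                    # 6 portraits x 700 tag values

labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
xyz = PCA(n_components=3).fit_transform(X)  # 3 coordinates per portrait for plotting

for (x, y, z), lab in zip(xyz, labels):
    print('cluster %d: (%.2f, %.2f)' % (lab, x, y))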
k-means: an unsupervised clustering algorithm (it is worth searching for the difference between unsupervised and supervised learning). The key decision is the value k: it fixes how many clusters the data is split into, and each sample is assigned to the cluster whose center it is closest to. For the algorithm's internals, see the link I post below; a toy example follows.
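A minimal, self-contained sketch (toy 2-D data with two obvious blobs; k=2 is chosen to match):

import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1.0, 1.1], [0.9, 1.0], [1.1, 0.9],    # blob around (1, 1)
              [8.0, 8.2], [7.9, 8.1], [8.1, 7.9]])   # blob around (8, 8)

km = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = km.fit_predict(X)    # cluster index for each sample, e.g. [0 0 0 1 1 1]
print(labels)
print(km.cluster_centers_)    # one centroid per cluster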
PCA: principal component analysis. Informally, it extracts the main features and analyzes those; that process is what is meant by dimensionality reduction. How do you know which features are the main ones and which can be dropped? PCA ranks the components by how much of the data's variance each one explains, so you keep the top few. Combined with the link I post below, this should become clear; a sketch of that ranking follows.
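A minimal sketch (toy data where two columns are made to share one strong direction):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
X[:, 1] = 3 * X[:, 0]                  # one direction now dominates the variance

pca = PCA(n_components=3)
low = pca.fit_transform(X)             # 100 x 3, like lowDDataMat in the view above
print(pca.explained_variance_ratio_)   # share of variance explained per component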
My own take feels too shallow, so instead I offer two articles on these topics that I found particularly good:
k-means: https://www.cnblogs.com/zy230530/p/7029025.html
PCA: https://blog.csdn.net/qq_39422642/article/details/78821812