Clustering multiple feature groups with k-means combined with PCA

Data: 6 user profiles, each described by about 700 tags.
Explanation: every profile has a value under each tag, so each profile becomes one sample whose features are its tag values, and those samples are what we cluster. A miniature sketch of the matrix follows.
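To make that concrete, here is a minimal sketch (the tag names and values are made up) of the final matrix shape that the view below feeds into k-means: one row per profile, one column per tag. The real code assembles this matrix tag-by-tag with packages as columns and transposes it at the end.

import pandas as pd

# hypothetical miniature of the real 6-profile x ~700-tag matrix
df = pd.DataFrame(
	{
		'性别:男': [0.62, 0.48, 0.55],
		'性别:女': [0.38, 0.52, 0.45],
		'年龄:18-24': [0.21, 0.35, 0.18],
	},
	index=['profile_a', 'profile_b', 'profile_c'],  # one row per profile (sample)
)
print(df.shape)  # (n_profiles, n_tags); the real matrix is roughly (6, 700)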

Result:
(Figure 1: clustering result)
The backend code:

import os
from datetime import datetime

import pandas as pd
from django.db.models import Count
from django.http import HttpResponse
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# CrowdPerspective (the Django model) and serializer (the JSON response
# helper) are project-specific; the import paths below are assumptions:
# from .models import CrowdPerspective
# from .utils import serializer

def kmeans_img(request):
	# read the parameters posted by the front end
	UPLOAD_ROOT=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static/img')
	brand_name = request.POST.get('brand_name', '').strip()
	package_name_arr = request.POST.getlist('package_name_arr[]', [])
	label_lists = request.POST.getlist('label_lists[]', [])
	font_name = request.POST.get('font_name', '0').strip()
	K_value = request.POST.get('K_value', '2').strip()

	# axis limits supplied when the user zooms into a region of the plot
	x_min = request.POST.get('x_min', None)
	x_max = request.POST.get('x_max', None)
	y_min = request.POST.get('y_min', None)
	y_max = request.POST.get('y_max', None)

	font_size_diy = int(request.POST.get('font_size_diy')) if request.POST.get('font_size_diy') else 7
	font_color_diy = request.POST.get('font_color_diy') if request.POST.get('font_color_diy') else '000000'
	font_color_diy = '#' + font_color_diy
	point_size_diy = int(request.POST.get('point_size_diy')) if request.POST.get('point_size_diy') else 5


	# package IDs to exclude ("dig out") from the plot
	except_packages_list = request.POST.getlist('except_packages_list[]', [])
	# package names to exclude
	except_checked_packname_arr = request.POST.getlist('except_checked_packname_arr[]', [])

	# '1' = re-cluster without the excluded packages; '0' = only hide the excluded points
	check_loop_k = request.POST.get('check_loop_k', '0').strip()
	if check_loop_k == '1':  # `is` compares identity, not equality, so == is required here
		package_name_arr = list(set(package_name_arr).difference(set(except_packages_list)))

	packages = CrowdPerspective.objects.filter(
		brand=brand_name, package_id__in=package_name_arr, label__in=label_lists,
	).values(
		"package_name", "package_id", "package_count",
	).order_by("package_name").annotate(total_pages=Count("package_id")).all()
	print('*****')
	packages_df = pd.DataFrame(list(packages))
	
	print(packages_df)


	error_info = ''
	temp_sum = []
	df_empty = pd.DataFrame({})
	# build one feature column per package from its secondary labels
	for k in range(len(packages)):
		if packages[k]['package_id'] == 0:
			error_info = packages[k]['package_name'] + ' :人群包缺少包ID异常'
			break
		Cp = CrowdPerspective.objects.filter(package_id=packages[k]['package_id'], label__in=label_lists).all().values()


		temp = []

		count = 0
		inter_err_info = ''
		for data in Cp:
			p_data = []
			p_show_name = []
			p_index = []
			if data and data["postData"] and data["postData"]["axises"] and data["postData"]["axises"][0]:
				p_data = data["postData"]["datas"][0]["values"]
				# str.lstrip('概览_') would strip any of those characters, not the
				# prefix as a whole, so remove the prefix with replace() instead
				label = data["label"].replace('概览_', '', 1)
				for ax in data["postData"]["axises"][0]["values"]:
					if data["tag"] != 'undefined':
						if data["tag"] == '预测年龄':
							tag = data["tag"].replace('预测', '', 1)
						else:
							tag = data["tag"]
					else:
						tag = data["alias_tag"]
					p_index.append(label + ":" + tag + ":" + ax["showName"] + ":::" + ax["key"])
			else:
				inter_err_info = packages[k]['package_name'] + ' :包数据异常'
				break
			count += 1

			# how each point is named in the cluster plot: by package name or by ID
			if font_name == '0':
				# matrix of secondary-label values for this package
				obj = pd.DataFrame(p_data, index=p_index, columns=[packages[k]['package_name']])
			else:
				obj = pd.DataFrame(p_data, index=p_index, columns=[packages[k]['package_id']])
			# print(obj)
			temp.append(obj)
		if inter_err_info !='':
			error_info = inter_err_info
			break


		# merge this package's secondary labels into one frame for clustering
		resl = pd.concat(temp)
		# print(resl.index.is_unique)
		# drop duplicate labels so a repeated tag cannot add extra rows
		resl = resl.reset_index(drop=False)
		resl = resl.drop_duplicates('index')
		# print(resl.index.is_unique)
		resl = resl.set_index('index')

		df_empty =  pd.concat([df_empty,resl],axis=1)

	if error_info !='':
		data = {'status': 'error','msg': error_info}
		json_data = serializer(data, output_type='json')
		return HttpResponse(json_data, content_type="application/json")
	

	df_empty = df_empty.reset_index(drop=False)
	df_empty['index'] = df_empty['index'].map(lambda x: x.split(':::')[0])
	# drop rows whose label falls into a catch-all or out-of-scope category
	mask = df_empty['index'].str.contains(r':(?:其他|未知|香港|台湾|澳门|理财专家)')
	df_empty.drop(df_empty[mask].index, inplace=True)
	df_empty = df_empty.set_index('index')
	df_empty = df_empty.fillna('0')
	# one row per package, one column per tag; k-means clusters the rows
	dfs = df_empty.T.astype('float64')
	# DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement
	dataSet = dfs.to_numpy()

	print('dataSet-----:%s' % (dataSet))
	# the steps above clean the data: fill missing values and drop
	# catch-all rows such as 其他 (other) and 未知 (unknown)


	dataLine = dfs.index
	print(dataLine)
	print('---------------------------------')
	import matplotlib
	matplotlib.use('Agg')  # headless backend so plots can be rendered server-side
	import matplotlib.pyplot as plt
	# plt.rcParams['font.sans-serif']=['PingFang HK']
	# plt.rcParams['axes.unicode_minus']=False

	# the current axes has four spines; hide the top and right ones
	ax = plt.gca()
	ax.spines['top'].set_color('none')
	ax.spines['right'].set_color('none')


	ktest = KMeans(n_clusters=int(K_value))

	try:
		model = ktest.fit(dataSet)
		y_pred=ktest.predict(dataSet)

		print('y_pred------:%s'%(y_pred))
		print('-------------------------------')

	except Exception as e:
		# str() is required: a str cannot be concatenated with an Exception
		data = {'status': 'error', 'msg': '聚类异常,' + str(e)}
		json_data = serializer(data, output_type='json')
		return HttpResponse(json_data, content_type="application/json")
	# centers = model.cluster_centers_

	# project the high-dimensional samples onto the first three principal components
	pca = PCA(copy=True, n_components=3, whiten=False)

	print('pca------:%s' % (pca))

	# fit_transform centers the data and projects it into 3-D PCA space
	# (whiten=False, so component variances are not rescaled to 1)
	lowDDataMat = pca.fit_transform(dataSet)
	print('lowDDataMat----:%s' % (lowDDataMat))

	# use a CJK-capable font so the Chinese point labels render correctly
	plt.rcParams['font.sans-serif'] = ['SimHei']

	dfs_xy = pd.DataFrame(lowDDataMat, columns=['x', 'y', 'z'])
	print('dfs_xy before labelling------:%s' % (dfs_xy))
	dfs_xy['label'] = y_pred
	dfs_xy['name'] = dataLine
	print('dfs_xy*****')
	print('dfs_xy after labelling------:%s' % (dfs_xy))
	print('------------c---c----c-------------')
	if font_name == '0':
		packages_df = packages_df.rename(columns={'package_name': 'name'})
		new_dfs_xy = dfs_xy.merge(packages_df, how='left', on='name')
	else:
		# the queryset has no 'id' column; the ID column is 'package_id'
		packages_df = packages_df.rename(columns={'package_id': 'name'})
		new_dfs_xy = dfs_xy.merge(packages_df, how='left', on='name')
	print(new_dfs_xy)
	# new_dfs_xy = new_dfs_xy.fillna('0')
	# new_dfs_xy['package_count'] = new_dfs_xy['package_count'].astype(int)

	file_name=datetime.now().strftime("%Y%m%d%H%M%S")
	new_dfs_xy[['x','y','z','label','name','package_count']].to_excel(os.path.join(UPLOAD_ROOT, file_name+'.xlsx'))
	# when re-clustering, the excluded packages were already removed above,
	# so only the "hide points" mode (check_loop_k == '0') filters here
	if check_loop_k == '0' and font_name == '1':
		# points named by package ID
		print('dropping excluded package IDs:')
		new_dfs_xy['name'] = new_dfs_xy['name'].astype(str)
		new_dfs_xy = new_dfs_xy[~(new_dfs_xy.name.isin(except_packages_list))]
	if check_loop_k == '0' and font_name == '0':
		# points named by package name
		print('dropping excluded package names:')
		new_dfs_xy = new_dfs_xy[~(new_dfs_xy.name.isin(except_checked_packname_arr))]

	if x_min is not None:
		# keep only the points inside the zoomed-in region
		print('zoom range:')
		new_dfs_xy = new_dfs_xy[
			(new_dfs_xy.x >= float(x_min)) & (new_dfs_xy.x <= float(x_max)) &
			(new_dfs_xy.y >= float(y_min)) & (new_dfs_xy.y <= float(y_max))
		]
	# plt.scatter(lowDDataMat[:,0],lowDDataMat[:,1],c=y_pred,cmap=plt.cm.Paired,s=point_size_diy)
	
	# the s= argument also accepts an array, e.g. to size points by package_count:
	# area = (30 * np.random.rand(10)) ** 2
	# plt.scatter(x, y, s=area, c=colors, alpha=0.5)
	# plt.scatter(new_dfs_xy.x.values, new_dfs_xy.y.values, c=new_dfs_xy.label.values, cmap=plt.cm.Paired, s=new_dfs_xy.package_count.values/10000)
	plt.scatter(new_dfs_xy.x.values, new_dfs_xy.y.values, c=new_dfs_xy.label.values, cmap=plt.cm.Paired, s=point_size_diy)
	

	from matplotlib.font_manager import FontProperties

	# custom CJK font; the path depends on which fonts are installed on the server
	myfont = FontProperties(fname='/usr/share/fonts/SimHei/msyh.ttf')
	# keep the minus sign '-' from rendering as a box
	matplotlib.rcParams['axes.unicode_minus'] = False
	plt.rc('font', family='Microsoft YaHei', size=font_size_diy, weight='bold')
	# annotate every point with its package name / ID
	for line, x, y in zip(new_dfs_xy.name.values, new_dfs_xy.x.values, new_dfs_xy.y.values):
		if line != '.':
			plt.annotate(line, xy=(x, y), xytext=(x, y), color=font_color_diy)

	
	plt.savefig(os.path.join(UPLOAD_ROOT, file_name+'.png'),dpi=1000,facecolor='w', transparent=True)
	plt.close()
	data = {'status': 'success','file_name':file_name+'.png', 'xy':file_name,'msg': 'ok!'}
	json_data = serializer(data, output_type='json')
	return HttpResponse(json_data, content_type="application/json")

k-means: an unsupervised clustering algorithm (it is worth looking up the difference between supervised and unsupervised learning). The key decision is the value of k: it says how many clusters the data will be partitioned into, and every sample ends up attached to the centroid it is closest to. For the internals of the algorithm, see the first link I've posted below; a minimal sketch follows.
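As a standalone sketch on random data (not the view's real inputs), this is all scikit-learn needs to run k-means; fit_predict returns the cluster index of every sample:

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = rng.rand(6, 700)        # 6 samples with 700 features, shaped like the profile matrix

km = KMeans(n_clusters=2)   # k must be chosen up front
labels = km.fit_predict(X)  # cluster index (0 or 1) for each sample
print(labels)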
PCA: principal component analysis. Loosely speaking, it extracts the dominant features for analysis, and that process is what "dimensionality reduction" means. How do you know which features are important and which can be discarded? PCA has an internal procedure for ranking them; together with the second link below it should become clear. A sketch is shown after this paragraph.
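Here is the corresponding PCA sketch, again on random data; explained_variance_ratio_ is the usual way to judge how much information the kept components retain, which answers the "which features matter" question above:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.rand(6, 700)

pca = PCA(n_components=3)             # keep the 3 strongest directions, as in the view above
X_low = pca.fit_transform(X)          # (6, 700) -> (6, 3)
print(X_low.shape)
print(pca.explained_variance_ratio_)  # variance fraction captured by each component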

My own explanations here feel rather shallow, so let me share two particularly good articles I have read:
k-means:https://www.cnblogs.com/zy230530/p/7029025.html
PCA:https://blog.csdn.net/qq_39422642/article/details/78821812
