MySQL二手车数据清洗及特征处理

1. 数据来源:瓜子二手车爬虫(数据部分截图如下)

MySQL二手车数据清洗及特征处理_第1张图片

数据集字段含义:

  • Brand:品牌
  • Name:型号
  • Boarding_time:上牌时间
  • Km:里程数
  • Discharge:排放标准
  • Sec_price:二手车价格
  • New_price:新车价格

2、缺失值的查找和处理

SELECT
	count(*)
FROM
	used_car_analysis
WHERE
	Brand IS NULL
OR Name IS NULL
OR Boarding_time IS NULL
OR Km IS NULL
OR Discharge IS NULL
OR Sec_price IS NULL
OR New_price IS NULL;

3、数据类型不一致的处理

3、1 Brand

查看“品牌”列,无异常值。合计104种品牌。

SELECT Brand, count(*) from used_car_analysis
group by Brand;

SELECT count(*) from (SELECT Brand, count(*) from used_car_analysis
group by Brand) c;

3、2 Name

“型号”列比较复杂,等下会进行特征提取。

3、3 Bording_time

“上牌时间”中有93个“暂无”数据。全部修改为当前年份和月份。

SELECT
	Boarding_time,
	count(*)
FROM
	used_car_analysis
WHERE
	Boarding_time NOT LIKE "%年%月";

UPDATE used_car_analysis
SET Boarding_time = "2020年6月"
WHERE
	Boarding_time = "未上牌";

3、4 Km

对于“里程数”列,存在部分数据为“百公里内”。将这些数据全部替换为0.01(即0.01万公里);其他数据删除单位“万公里”

SELECT
	*
FROM
	used_car_analysis
WHERE
	Km NOT LIKE "%万%"
GROUP BY
	Km;

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		(
			CASE
			WHEN Km LIKE "%万公里" THEN
				LEFT (Km, LOCATE("万", Km) - 1)
			ELSE
				0.01
			END
		) Km
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Km = b.Km;

 

3、5 Discharge

对此列不做处理。本次分析不会用到这一列。

3、6 Sec_price

查看是否有0。结果:无。

3、7 New_price

查看是否有不同于其他数据的数据。

SELECT
	*
FROM
	used_car_analysis
WHERE
	New_price NOT LIKE "%万"

part 2. 特征处理

1、车龄

根据Boarding_time,抽取上牌年份和月份,计算出距离当前时间的月份数,即车龄。

ALTER TABLE used_car_analysis ADD COLUMN Boarding_year INT 

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		LEFT (
			Boarding_time,
			LOCATE("年", Boarding_time) - 1
		) Boarding_year
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Boarding_year = b.Boarding_year 
#提取上牌年份

ALTER TABLE used_car_analysis ADD COLUMN Boarding_month INT 

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		SUBSTRING(
			Boarding_time
			FROM
				LOCATE("年", Boarding_time) + 1 FOR (
					LOCATE("月", Boarding_time) - 1 - LOCATE("年", Boarding_time)
				)
		) Boarding_month
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Boarding_month = b.Boarding_month 
#提取上牌月份

ALTER TABLE used_car_analysis ADD COLUMN time INT 

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		(
			(
				YEAR (CURDATE()) - Boarding_year
			) * 12 + MONTH (CURDATE()) - Boarding_month
		) time
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.time = b.time 
#计算车龄

ALTER TABLE used_car_analysis DROP COLUMN Boarding_month 
ALTER TABLE used_car_analysis DROP COLUMN Boarding_year 
#删除无用的列

2、折旧率(年限平均折旧法)

ALTER TABLE used_car_analysis ADD COLUMN Depreciation FLOAT;

UPDATE used_car_analysis a
INNER JOIN (
SELECT Name,
(case 
WHEN time !=0 then ((New_price - Sec_price)*12/time)/New_price  
when time = 0 then ((New_price - Sec_price)/(time+1))/New_price 
end) Depreciation
from used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Depreciation= b.Depreciation ;


3、折旧率(按照行驶里程数折旧)

ALTER TABLE used_car_analysis ADD COLUMN Dep_Distance FLOAT 
UPDATE used_car_analysis a
INNER JOIN (
SELECT Name,
(case 
WHEN Km !=0 then ((New_price - Sec_price)*10000/Km)/New_price
ELSE 0
end) Dep_Distance
from used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.Dep_Distance = b.Dep_Distance 

4、款式

从Name中提取二手车的款式。

select count(*) from used_car_analysis
WHERE Name like "%款%";
#查看是否每个Name中是否都含有“款”字,为之后的款式字符抽取做准备。(结果表明每列都包含有“款”字)

ALTER TABLE used_car_analysis ADD COLUMN Style INT;

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		Name,right(style,4) Style from (SELECT Name,SUBSTRING_INDEX(Name,'款',1) style FROM used_car_analysis) c
) AS b ON a. NAME = b. NAME
SET a.Style = b.Style ;
#抽取每款车的款式

5、型号

从Name中提取二手车的型号。

ALTER TABLE used_car_analysis ADD COLUMN model CHAR(255);

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		SUBSTRING_INDEX(model, "20", 1) model
	FROM
		used_car_analysis
) AS b ON a. NAME = b. NAME
SET a.model = b.model;
#提取每款车的型号。

UPDATE used_car_analysis a
INNER JOIN (
	SELECT
		NAME,
		SUBSTRING_INDEX(model, " ", 1) model
	FROM
		(
			SELECT
				NAME,
				Style,
				model
			FROM
				used_car_analysis
			WHERE
				Style NOT LIKE "20%"
		) c
) AS b ON a. NAME = b. NAME
SET a.model = b.model;
#对于2000年以前的型号,另做更新。


(为方便“二八”法则的分析,特提取下列数据信息)

6、各品牌销售量累计求和

#对各品牌的销量累计求和
SET @csum := 0;
SELECT
	Brand,
	sale,
	(@csum := @csum + sale) AS cum_sale
FROM
	(
		SELECT
			Brand,
			count(*) sale
		FROM
			used_car_analysis
		GROUP BY
			Brand
		ORDER BY
			sale DESC
	) c;

7、各品牌销售额累计求和

SET @csum := 0;
SELECT
	Brand, 
  sale_money,
	(@csum := @csum + sale_money)  cum_sale_money 
FROM (
select Brand,
  sum(if(Brand = Brand,Sec_price,0)) sale_money
from used_car_analysis
group by Brand 
ORDER by sale_money DESC
) c;

 

你可能感兴趣的:(python数据分析)