解决爬虫数据(电影院比价系统)电影院名称不规范问题解决思路

1 问题

各大网站录入电影院,地址没有统一的规范,造成电影票无法比价。

2 解决思路

2.1 经纬度范围查找

拿到的数据中包含经度、纬度信息,根据经纬度范围查找,把这些名字不同但位置相同的记录锁定为同一家电影院。

2.1.1 各大网站使用的地图坐标协议不同

(google、高德、腾讯、图吧地图、图吧导航)使用的是gcj02,百度、搜狗使用的是另外一种坐标协议bd09。所以在网上找了一份java写的、把各大地图坐标统一转换为百度地图坐标的代码,然后改写为mysql的自定义函数,转换后误差在十万分之五度(0.00005度)左右(按下面的换算表,距离大概是5-5.5米)。

一、经纬度距离换算

a)在纬度相等的情况下:

经度每隔0.00001度,距离相差约1米;
每隔0.0001度,距离相差约10米;
每隔0.001度,距离相差约100米;
每隔0.01度,距离相差约1000米;
每隔0.1度,距离相差约10000米。

b)在经度相等的情况下:

纬度每隔0.00001度,距离相差约1.1米;
每隔0.0001度,距离相差约11米;
每隔0.001度,距离相差约111米;
每隔0.01度,距离相差约1113米;
每隔0.1度,距离相差约11132米。

高德 convert to 百度经纬度函数

(网上java有现成代码,这是根据java改写mysql代码)。
各个地图经纬度转换

转换纬度

DELIMITER |

-- Convert a GCJ-02 coordinate to BD-09 and return the BD-09 latitude.
--
-- Parameters:
--   longitude  GCJ-02 longitude in degrees
--   latitude   GCJ-02 latitude in degrees
-- Returns:
--   BD-09 latitude in degrees, rounded to 6 decimals by the return type.
--
-- Working variables are plain DOUBLE: the previous DOUBLE(9,8) declaration of
-- x_pi could hold at most 9.99999999, but x_pi is pi * 3000 / 180 (about 52.36),
-- so the value was clamped (or rejected in strict SQL mode).
-- DETERMINISTIC / NO SQL are declared because the result depends only on the
-- arguments; servers with binary logging enabled refuse to create functions
-- that lack such a characteristic.
CREATE FUNCTION convert_gcj02_to_bd09_lat(longitude DOUBLE(9,6),latitude DOUBLE(9,6))
RETURNS DOUBLE(9,6)
DETERMINISTIC
NO SQL
BEGIN
    DECLARE x_pi DOUBLE;
    DECLARE z DOUBLE;
    DECLARE theta DOUBLE;

    SET x_pi = 3.14159265358979324 * 3000.0 / 180.0;

    -- Polar form of the input point with the small periodic BD-09 perturbations.
    SET z = SQRT(longitude * longitude + latitude * latitude) + 0.00002 * SIN(latitude * x_pi);
    SET theta = ATAN2(latitude, longitude) + 0.000003 * COS(longitude * x_pi);

    -- Back to Cartesian; 0.006 is the constant latitude offset of the conversion.
    RETURN z * SIN(theta) + 0.006;
END |

DELIMITER ;

测试

-- Smoke test: convert the sample GCJ-02 point (lng 120.098703, lat 29.324483) and show the resulting latitude.
SELECT convert_gcj02_to_bd09_lat(120.098703,29.324483);

转换经度

DELIMITER |

-- Convert a GCJ-02 coordinate to BD-09 and return the BD-09 longitude.
--
-- Parameters:
--   longitude  GCJ-02 longitude in degrees
--   latitude   GCJ-02 latitude in degrees
-- Returns:
--   BD-09 longitude in degrees, rounded to 6 decimals by the return type.
--
-- Working variables are plain DOUBLE: the previous DOUBLE(9,8) declaration of
-- x_pi could hold at most 9.99999999, but x_pi is pi * 3000 / 180 (about 52.36),
-- so the value was clamped (or rejected in strict SQL mode).
-- DETERMINISTIC / NO SQL are declared because the result depends only on the
-- arguments; servers with binary logging enabled refuse to create functions
-- that lack such a characteristic.
CREATE FUNCTION convert_gcj02_to_bd09_lng(longitude DOUBLE(9,6),latitude DOUBLE(9,6))
RETURNS DOUBLE(9,6)
DETERMINISTIC
NO SQL
BEGIN
    DECLARE x_pi DOUBLE;
    DECLARE z DOUBLE;
    DECLARE theta DOUBLE;

    SET x_pi = 3.14159265358979324 * 3000.0 / 180.0;

    -- Polar form of the input point with the small periodic BD-09 perturbations.
    SET z = SQRT(longitude * longitude + latitude * latitude) + 0.00002 * SIN(latitude * x_pi);
    SET theta = ATAN2(latitude, longitude) + 0.000003 * COS(longitude * x_pi);

    -- Back to Cartesian; 0.0065 is the constant longitude offset of the conversion.
    RETURN z * COS(theta) + 0.0065;
END |

DELIMITER ;

测试

-- Smoke test: convert the sample GCJ-02 point (lng 120.098703, lat 29.324483) and show the resulting longitude.
SELECT convert_gcj02_to_bd09_lng(120.098703,29.324483);

根据经纬度计算距离函数

DELIMITER |

-- Great-circle (haversine) distance between two points, in whole metres.
--
-- Parameters (note the order: both latitudes first, then both longitudes):
--   lat1, lat2  latitudes of the two points, in degrees
--   lng1, lng2  longitudes of the two points, in degrees
-- Returns:
--   distance in metres, rounded to an integer value, using an Earth radius of
--   6378.138 km; NULL if any argument is NULL.
--
-- The result is returned directly instead of through the session variable
-- @distance, so calling the function no longer overwrites caller state.
CREATE FUNCTION `juli`(lat1 DOUBLE(10,7),lat2 DOUBLE(10,7),lng1 DOUBLE(10,7),lng2 DOUBLE(10,7)) RETURNS double
DETERMINISTIC
NO SQL
BEGIN
    DECLARE half_dlat DOUBLE DEFAULT RADIANS(lat1 - lat2) / 2;
    DECLARE half_dlng DOUBLE DEFAULT RADIANS(lng1 - lng2) / 2;
    DECLARE h DOUBLE;

    -- Haversine term; mathematically within [0, 1].
    SET h = POW(SIN(half_dlat), 2)
          + COS(RADIANS(lat1)) * COS(RADIANS(lat2)) * POW(SIN(half_dlng), 2);

    -- LEAST() guards against rounding pushing h slightly above 1, where
    -- ASIN() would return NULL (near-antipodal points).
    RETURN ROUND(6378.138 * 2 * ASIN(SQRT(LEAST(1, h))) * 1000);
END |

DELIMITER ;

弃用经纬度算法

很多影院的经纬度信息为null,而且有些经纬度信息不太准确,所以后面弃用了根据经纬度去定位是否为同一家影院。

改为根据电影院名字、电话、地址的相似度进行匹配。

公式如下

相似度 = count(A∩B) / (count(A) + count(B) - count(A∩B)),其中 A、B 分别是两个字符串的单词集合,A∩B 是两者共有的相似单词。

代码如下:

电影院名称相似度匹配

对比两个字符串

DELIMITER ;;

-- Levenshtein edit distance between two strings, counted in characters.
--
-- Parameters:
--   s1, s2  strings to compare
-- Returns:
--   number of single-character insertions, deletions and substitutions needed
--   to turn s1 into s2; NULL if either argument is NULL (previously NULL input
--   returned 0, i.e. "identical").
--
-- The two rolling cost rows (cv0 = current row, cv1 = previous row) are packed
-- into binary strings using two bytes per cell, so lengths and distances up to
-- 65535 are representable. The earlier one-byte-per-cell encoding stored in a
-- TEXT variable broke as soon as a length or cost exceeded 255 (or 127 when
-- the variable's character set was utf8).
CREATE FUNCTION `levenshtein`( s1 TEXT, s2 TEXT) RETURNS INT(11)
DETERMINISTIC
BEGIN
    DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
    DECLARE s1_char CHAR;
    DECLARE cv0, cv1 BLOB;

    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2), cv1 = UNHEX('0000'), j = 1, i = 1, c = 0;

    IF s1 = s2 THEN
        RETURN 0;
    ELSEIF s1_len = 0 THEN
        RETURN s2_len;
    ELSEIF s2_len = 0 THEN
        RETURN s1_len;
    ELSE
        -- Row 0 of the cost matrix: 0, 1, 2, ..., s2_len.
        WHILE j <= s2_len DO
            SET cv1 = CONCAT(cv1, UNHEX(LPAD(HEX(j), 4, '0')));
            SET j = j + 1;
        END WHILE;

        WHILE i <= s1_len DO
            SET s1_char = SUBSTRING(s1, i, 1);
            -- Column 0 of row i is i (delete the first i characters of s1).
            SET c = i;
            SET cv0 = UNHEX(LPAD(HEX(i), 4, '0'));
            SET j = 1;

            WHILE j <= s2_len DO
                -- Candidate 1: cell to the left + 1.
                SET c = c + 1;

                IF s1_char = SUBSTRING(s2, j, 1) THEN
                    SET cost = 0;
                ELSE
                    SET cost = 1;
                END IF;

                -- Candidate 2: diagonal cell (previous row, column j-1) + substitution cost.
                SET c_temp = CONV(HEX(SUBSTRING(cv1, 2 * j - 1, 2)), 16, 10) + cost;
                IF c > c_temp THEN
                    SET c = c_temp;
                END IF;

                -- Candidate 3: cell above (previous row, column j) + 1.
                SET c_temp = CONV(HEX(SUBSTRING(cv1, 2 * j + 1, 2)), 16, 10) + 1;
                IF c > c_temp THEN
                    SET c = c_temp;
                END IF;

                SET cv0 = CONCAT(cv0, UNHEX(LPAD(HEX(c), 4, '0')));
                SET j = j + 1;
            END WHILE;

            SET cv1 = cv0;
            SET i = i + 1;
        END WHILE;
    END IF;

    -- c holds the last cell of the last row, i.e. the full edit distance.
    RETURN c;
END ;;

DELIMITER ;

两个字符串相似度占比

DELIMITER ;;

-- Similarity of two strings as an integer percentage (100 = identical).
--
-- Parameters:
--   s1, s2  strings to compare
-- Returns:
--   ROUND((1 - levenshtein(s1, s2) / longer_length) * 100);
--   NULL if either argument is NULL or both strings are empty.
--
-- The length is measured with CHAR_LENGTH (characters), matching the unit in
-- which levenshtein() counts edits. The previous LENGTH() call measured bytes,
-- so multi-byte names (e.g. Chinese text in utf8, 3 bytes per character)
-- were scored far more similar than they are.
CREATE FUNCTION `levenshtein_ratio`( s1 TEXT, s2 TEXT ) RETURNS INT(11)
DETERMINISTIC
BEGIN
    DECLARE max_len INT;

    -- GREATEST() yields NULL when either length is NULL, so NULL input gives NULL output.
    SET max_len = GREATEST(CHAR_LENGTH(s1), CHAR_LENGTH(s2));

    -- NULLIF() makes the both-empty case an explicit NULL instead of a division by zero.
    RETURN ROUND((1 - LEVENSHTEIN(s1, s2) / NULLIF(max_len, 0)) * 100);
END ;;

DELIMITER ;

通过几次测试,相似度大于等于90的大致为同一影院。个别电影院名字极度相仿的,可以对相似度阈值做一些调整。

-- List every row of bidding_cinema_data whose cinema_name is at least 90%
-- similar to the reference name; the score is exposed as column xiangshi.
SELECT scored.*
FROM (
    SELECT *, levenshtein_ratio('龙海金逸影城(美一店)', cinema_name) xiangshi
    FROM `bidding_cinema_data`
) AS scored
WHERE scored.xiangshi >= 90;

重回经纬度

字符串匹配的精确度很难达到80以上(因为有的电影院名字很短,只有两个字或4个字),
所以这些电影院在做相似度匹配的时候,很难区分…

问题

采集到的数据,有的经纬度信息为null
所以根据百度地图接口传入地址来补全经纬度信息.

根据经纬度范围打标签

SQL脚本如下:
打标签第一版本

DELIMITER ;;

-- Labeling, version 1: give one label to every still-unlabeled cinema of a
-- city that lies inside a square window around a reference point, and derive
-- a rough brand string from the cinema name.
--
-- Parameters:
--   lng           longitude of the window centre
--   lat           latitude of the window centre
--   rounds        half-width of the window, in degrees, applied to both axes
--                 (lower bound inclusive, upper bound exclusive)
--   city_meta_id  city identifier; only rows of this city are touched
--   lables        label value written to the matched rows
--
-- Columns are qualified with the table alias on purpose: inside a stored
-- routine an unqualified name that matches a parameter resolves to the
-- parameter, so the former "city_meta_id = @city_meta_id" compared the
-- parameter with itself and never filtered by city.
CREATE PROCEDURE `set_lable`(lng DOUBLE,lat DOUBLE,rounds DOUBLE,city_meta_id int,lables int)
BEGIN
    UPDATE clean_cinema_data_copy AS a
    INNER JOIN bidding_city_data AS b
        ON a.city_id = b.city_id
        AND a.site_id = b.site_id
    SET a.lable = lables,
        -- Strip generic cinema words and the city name so that mostly the brand remains.
        a.brand = replace(replace(replace(replace(replace(replace(replace(replace(cinema_name,'电影院',' '),'电影城',' '),'影视城',' '),'国际',''),'影院',' '),'影城',' '),'影视',' '),city_name,' ')
    WHERE a.longitude <> 0.0
      AND a.city_meta_id = city_meta_id
      AND a.latitude >= lat - rounds AND a.latitude < lat + rounds
      AND a.longitude >= lng - rounds AND a.longitude < lng + rounds
      AND a.lable IS NULL;
END;;

DELIMITER ;

调用上面过程的脚本

批量打标签第一版本

-- Version-1 batch driver: emit one "call set_lable(...)" statement per cinema
-- of city 3120 that has a non-zero longitude, using a 0.006 half-width window
-- and a running counter as the label.
SET @rownum = 0;

SELECT
    concat('call set_lable(',longitude,',',latitude,',',0.006,',',3120,',',@rownum:=@rownum+1,');')
FROM clean_cinema_data_copy
WHERE longitude <> 0.0
  AND city_meta_id = 3120;

-- Copy the statements produced by this query into the IDE connected to the database and run them there.

根据经纬度范围打标签结果

最终的准确度在65%-75% 之间, 距离最终90%还有一定距离.

所以后面会加上一些brand的词库. 根据经纬度范围打过标签之后再根据brand这个维度再打一次.

上海市

千分之六:大于等于4家的是128个, 等于4家的是91个;

千分之三:大于等于4家的是139个, 等于4家的是113个;

千分之二:大于等于4家的是142个, 等于4家的是125个; SELECT 125*1.0/165 准确度:0.75758;

北京市

千分之六:大于等于4家的是110个, 等于4家的是83个; select 83*1.0/136 0.61029;

千分之三:大于等于4家的是107个, 等于4家的是92个; select 92*1.0/136 准确度: 0.67647;

千分之二:大于等于4家的是99个, 等于4家的是89个; SELECT 89*1.0/136 0.65441;

广州市

千分之六:大于等于4家的是78个, 等于4家的是58个; select 58*1.0/105 0.55238;

千分之三:大于等于4家的是79个, 等于4家的是68个; select 68*1.0/105 准确度:0.64762;

千分之二:大于等于4家的是76个, 等于4家的是66个; SELECT 66*1.0/105 0.62857;

根据经纬度范围和词库brand两个维度打标签的准确率

思路
根据两个经纬度打标签,打完标签, 本来5个的6个的7个的可能会分出来1,2,3条,
再加上一个维度打标签,完全为4个电影院的准确率为
上海市两个维度打标签 0.7879 上海由原来的0.75758 提升为0.7879
北京市两个维度打标签 0.7353 北京由原来的0.67647提升为0.7353
广州市两个维度打标签 0.6762 广州由原来的0.64762提升为0.6762
准确率还是不太高……

加维度

逻辑如下:
首先根据经纬度的范围打一次标签(把范围在200 米内的并且有4家[4个网站] 算为一个电影院) {结果集1}
再把范围200米内不为4家(有的比较密集,8家,12家)加上简单的品牌分词, 按 机器标签, 品牌分组 等于4个的集合 {结果集2}
再用该城市的全部数据减去上面两个结果集的并集(即求差集),得到尚未匹配上的数据 {结果集3}
把贪心算法应用到结果集3:范围从500米开始,每次递减50米,一直处理到50米;每一轮把恰好等于4家的集合放入一张临时表(从500米到50米,一共会产生10张临时表)
创建过程,求 (结果集1 ∪ 结果集2) ∪ distinct {临时表1 ∪ 临时表2 ∪ 临时表3 ∪ 临时表4 ∪ 临时表5 ∪ 临时表6 ∪ 临时表7 ∪ 临时表8 ∪ 临时表9 ∪ 临时表10}
取出来的就是该城市的最终数据。上面公式的中文解释:10张临时表取并集并去除重复后,再与结果集1、结果集2取并集。

打标签代码如下

打标签第二版本

DELIMITER ;;

-- Labeling, version 2: like version 1, but works on any table with the same
-- layout and allocates the label itself (current MAX(lable) + 1 of that table,
-- or 1 when the table has no label yet).
--
-- Parameters:
--   tablename     table to update; it is joined to bidding_city_data on
--                 (city_id, site_id)
--   lng           longitude of the window centre
--   lat           latitude of the window centre
--   rounds        half-width of the window, in degrees, applied to both axes
--                 (lower bound inclusive, upper bound exclusive)
--   city_meta_id  city identifier; only rows of this city are touched
--
-- Effects: every row inside the window with a non-zero longitude and a NULL
-- lable receives the new label and a brand made of the first 3 characters of
-- the cinema name after generic words, the city name, brackets, dashes and
-- spaces are removed.
--
-- The table name is the only dynamic identifier; it is backtick-quoted (with
-- embedded backticks doubled) before being spliced into the statement. All
-- values are bound through placeholders instead of being concatenated.
-- Session state used: @sl1_* variables and the prepared-statement name stmt.
CREATE PROCEDURE `set_lable1`(tablename VARCHAR(48),lng DOUBLE,lat DOUBLE,rounds DOUBLE,city_meta_id INT)
BEGIN
    IF tablename IS NULL OR tablename = '' THEN
        SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'set_lable1: tablename must not be empty';
    END IF;

    SET @sl1_table = CONCAT('`', REPLACE(tablename, '`', '``'), '`');

    -- Allocate the next free label of the target table.
    SET @sl1_sql = CONCAT('SELECT IFNULL(MAX(lable) + 1, 1) INTO @sl1_label FROM ', @sl1_table);
    PREPARE stmt FROM @sl1_sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;

    -- EXECUTE ... USING only accepts user variables, so copy the bound values.
    SET @sl1_city   = city_meta_id,
        @sl1_lat_lo = lat - rounds,
        @sl1_lat_hi = lat + rounds,
        @sl1_lng_lo = lng - rounds,
        @sl1_lng_hi = lng + rounds;

    -- NOTE(review): "(" and ")" each appear twice in the REPLACE chain; one of
    -- each pair was presumably the full-width bracket before the text was
    -- normalized — confirm against the original script.
    SET @sl1_sql = CONCAT(
        'UPDATE ', @sl1_table, ' AS a',
        ' INNER JOIN bidding_city_data AS b',
        ' ON a.city_id = b.city_id AND a.site_id = b.site_id',
        ' SET lable = ?,',
        ' brand = LEFT(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(cinema_name,"电影院",""),"电影城",""),"影视城",""),"国际",""),"影院",""),"影城",""),"影视",""),city_name,""),"(",""),"(",""),")",""),")",""),"影剧院",""),"-","")," ",""),3)',
        ' WHERE longitude <> 0.0',
        ' AND city_meta_id = ?',
        ' AND latitude >= ? AND latitude < ?',
        ' AND longitude >= ? AND longitude < ?',
        ' AND lable IS NULL');
    PREPARE stmt FROM @sl1_sql;
    EXECUTE stmt USING @sl1_label, @sl1_city, @sl1_lat_lo, @sl1_lat_hi, @sl1_lng_lo, @sl1_lng_hi;
    DEALLOCATE PREPARE stmt;
END;;

DELIMITER ;

批量打标签脚本

批量打标签第二版本

delimiter |

-- Batch labeling, version 2: run the whole matching pipeline for one city.
--
-- Parameters:
--   city      city_meta_id of the city to process
--   rounds    half-width (degrees) of the window used for the first labeling pass
--   `groups`  number of listings that make up one complete cinema (one per
--             crawled site); back-quoted because GROUPS is a reserved word in
--             MySQL 8.0
--
-- Steps:
--   1. Label clean_cinema_data_copy for the city by calling set_lable1 once per
--      cinema with the caller's window size.
--   2. Build step_5_0 = cinemas whose label has exactly `groups` rows, plus
--      cinemas from other labels whose (label, brand) pair has exactly
--      `groups` rows.
--   3. For every window from 0.005 down to 0.0005 in steps of 0.0005, copy the
--      city's cinemas that are not in step_5_0 into tmp_5_0, tmp_4_5, ...,
--      tmp_0_5 and label that copy with set_lable1 using that window.
--
-- Behavior changes compared with the copy-pasted original:
--   * the ten passes are one loop, so every table gets the window its name
--     implies (tmp_2_5 used 0.005 and tmp_0_5 used 0.001 before);
--   * the row counter is reset for every tmp table;
--   * the brand regrouping tests every brand of a label instead of one
--     arbitrary brand, and no longer depends on ONLY_FULL_GROUP_BY being off;
--   * the second half of step_5_0 is filtered by city ("AND c.city_meta_id"
--     had lost its "= city").
-- Session state used: @rownum, @bsl_city, @bsl_sql and the prepared-statement
-- name bsl_stmt.
CREATE PROCEDURE batch_set_lable1(city INT,rounds DOUBLE,`groups` INT)
BEGIN
    DECLARE done INT DEFAULT -1;
    DECLARE lng DOUBLE;
    DECLARE lat DOUBLE;
    DECLARE step INT DEFAULT 10;      -- window = step * 0.0005, counted down from 10 to 1
    DECLARE pass_table VARCHAR(48);
    DECLARE pass_radius DOUBLE;

    DECLARE cur CURSOR FOR
        SELECT longitude, latitude
        FROM clean_cinema_data_copy
        WHERE city_meta_id = city
          AND longitude <> 0.0;

    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;

    -- Step 1: first labeling pass over the source table.
    OPEN cur;
    read_loop: LOOP
        FETCH cur INTO lng, lat;
        IF done = 1 THEN
            LEAVE read_loop;
        END IF;
        CALL set_lable1('clean_cinema_data_copy', lng, lat, rounds, city);
    END LOOP;
    CLOSE cur;

    -- Step 2: rows that already form complete cinemas.
    DROP TABLE IF EXISTS step_5_0;

    CREATE TABLE step_5_0 AS
    -- (a) labels that contain exactly `groups` rows
    SELECT a.*
    FROM clean_cinema_data_copy AS a
    INNER JOIN (
        SELECT lable
        FROM clean_cinema_data_copy
        WHERE city_meta_id = city
        GROUP BY lable
        HAVING COUNT(*) = `groups`
    ) AS complete_label
        ON a.lable = complete_label.lable
    WHERE a.city_meta_id = city
    UNION
    -- (b) inside the remaining labels, (label, brand) pairs with exactly `groups` rows
    SELECT c.*
    FROM clean_cinema_data_copy AS c
    INNER JOIN (
        SELECT m.lable, m.brand
        FROM clean_cinema_data_copy AS m
        INNER JOIN (
            SELECT lable
            FROM clean_cinema_data_copy
            WHERE city_meta_id = city
              AND longitude <> 0
            GROUP BY lable
            HAVING COUNT(*) <> `groups`
        ) AS uneven_label
            ON m.lable = uneven_label.lable
        GROUP BY m.lable, m.brand
        HAVING COUNT(*) = `groups`
    ) AS d
        ON c.lable = d.lable
        AND c.brand = d.brand
    WHERE c.city_meta_id = city;

    -- Step 3: shrinking-window passes over everything not matched yet.
    SET @bsl_city = city;

    WHILE step >= 1 DO
        -- step 10 -> tmp_5_0 / 0.005, step 9 -> tmp_4_5 / 0.0045, ..., step 1 -> tmp_0_5 / 0.0005
        SET pass_table = CONCAT('tmp_', step DIV 2, '_', (step MOD 2) * 5);
        SET pass_radius = step * 0.0005;

        SET @bsl_sql = CONCAT('DROP TABLE IF EXISTS ', pass_table);
        PREPARE bsl_stmt FROM @bsl_sql;
        EXECUTE bsl_stmt;
        DEALLOCATE PREPARE bsl_stmt;

        -- The orders column is kept so the tmp tables keep their previous layout.
        SET @rownum = 0;
        SET @bsl_sql = CONCAT(
            'CREATE TABLE ', pass_table,
            ' SELECT src.*, @rownum := @rownum + 1 AS orders',
            ' FROM clean_cinema_data_copy AS src',
            ' WHERE src.city_meta_id = @bsl_city',
            ' AND (src.longitude <> 0.0 OR src.latitude <> 0.0)',
            ' AND NOT EXISTS (SELECT 1 FROM step_5_0 AS s WHERE s.id = src.id)');
        PREPARE bsl_stmt FROM @bsl_sql;
        EXECUTE bsl_stmt;
        DEALLOCATE PREPARE bsl_stmt;

        -- Intentionally no WHERE: every copied row starts this pass unlabeled.
        SET @bsl_sql = CONCAT('UPDATE ', pass_table, ' SET lable = NULL');
        PREPARE bsl_stmt FROM @bsl_sql;
        EXECUTE bsl_stmt;
        DEALLOCATE PREPARE bsl_stmt;

        -- Walk the same rows that were just copied (same filter on the source
        -- table) and label the copy around each of them.
        pass_block: BEGIN
            DECLARE pass_done INT DEFAULT 0;
            DECLARE pass_cur CURSOR FOR
                SELECT src.longitude, src.latitude
                FROM clean_cinema_data_copy AS src
                WHERE src.city_meta_id = city
                  AND (src.longitude <> 0.0 OR src.latitude <> 0.0)
                  AND NOT EXISTS (SELECT 1 FROM step_5_0 AS s WHERE s.id = src.id);
            DECLARE CONTINUE HANDLER FOR NOT FOUND SET pass_done = 1;

            OPEN pass_cur;
            pass_loop: LOOP
                FETCH pass_cur INTO lng, lat;
                IF pass_done = 1 THEN
                    LEAVE pass_loop;
                END IF;
                CALL set_lable1(pass_table, lng, lat, pass_radius, city);
            END LOOP;
            CLOSE pass_cur;
        END pass_block;

        SET step = step - 1;
    END WHILE;
END |

DELIMITER ;

获取最终结果的过程

获取结果的过程

delimiter |

-- Return the final matching result as one result set.
--
-- Parameter:
--   `groups`  number of listings that make up one complete cinema (one per
--             crawled site); back-quoted because GROUPS is a reserved word in
--             MySQL 8.0
--
-- Result set: the 19 cinema columns listed in cols plus lable, ordered by lable.
--   * rows of step_5_0 carry the label 'step_<n>';
--   * rows of tmp_5_0 ... tmp_0_5 are included when their label has exactly
--     `groups` rows in that table, and carry the label '<table>_<n>'.
--
-- Why the table name is part of the label: every tmp table numbers its labels
-- from 1, so a bare number from tmp_5_0 and the same number from tmp_4_5 are
-- different clusters that used to be indistinguishable in the output. A cinema
-- that is part of a complete group in several tables is returned once, with
-- the alphabetically smallest of its labels (previously an arbitrary one).
--
-- The ten identical branches are generated in a loop and run as one prepared
-- statement. Session state used: @cr_groups, @cr_sql and the
-- prepared-statement name cr_stmt.
CREATE PROCEDURE cinema_result(`groups` INT)
BEGIN
    DECLARE step INT DEFAULT 10;
    DECLARE pass_table VARCHAR(48);
    DECLARE passes_sql TEXT DEFAULT '';
    DECLARE cols TEXT DEFAULT 'id,cinema_id,agent_id,cinema_name,area,addr,`area_name`,tele,longitude,latitude,cinema_brand,url,score,service,city_id,site_id,STATUS,`cinema_meta_id`,`unique_name`';

    -- Referenced by name inside the generated SQL, so a NULL argument simply matches no group.
    SET @cr_groups = `groups`;

    -- One branch per tmp table: step 10 -> tmp_5_0, 9 -> tmp_4_5, ..., 1 -> tmp_0_5.
    WHILE step >= 1 DO
        SET pass_table = CONCAT('tmp_', step DIV 2, '_', (step MOD 2) * 5);
        SET passes_sql = CONCAT(
            passes_sql,
            IF(passes_sql = '', '', ' UNION ALL '),
            'SELECT ', cols, ', CONCAT(''', pass_table, '_'', lable) AS pass_lable',
            ' FROM ', pass_table,
            ' WHERE lable IN (SELECT lable FROM ', pass_table,
            ' GROUP BY lable HAVING COUNT(*) = @cr_groups)');
        SET step = step - 1;
    END WHILE;

    SET @cr_sql = CONCAT(
        'SELECT ', cols, ', CONCAT(''step_'', lable) AS lable FROM step_5_0',
        ' UNION ',
        'SELECT ', cols, ', MIN(pass_lable) AS lable',
        ' FROM (', passes_sql, ') AS passes',
        ' GROUP BY ', cols,
        ' ORDER BY lable');

    PREPARE cr_stmt FROM @cr_sql;
    EXECUTE cr_stmt;
    DEALLOCATE PREPARE cr_stmt;
END |

delimiter ;

过程调用方法

两个过程的使用方法

/*
第一个参数是城市编号,
第二个参数是第一次打标签使用的范围值(此处0.003或0.002能筛选出的数据最多)。
第三个参数:4,跟爬取的站点数对应。
*/
-- Run the labeling pipeline for city 3120: first-pass window half-width 0.002, 4 listings (crawled sites) per cinema.
CALL batch_set_lable1(3120,0.002,4);

/*
参数的含义是分几个站,
跟爬取的站点数量对应。
从表中拿出最终结果。
*/
-- Fetch the final result; 4 is the number of crawled sites, i.e. listings per complete cinema.
CALL cinema_result(4);

用贪心算法得出北上广三个城市的正确率:

上海:86.67%
北京:85.29%
广州:78.09%

剩余的一些数据:需要人工比对…

根据贪心算法得出数据的准确率

(
此准确率是跟人工分组每组4个对比得出, 人工分组不等于4 小于4数据不完整,大于4此影院多拿一条数据
我暂时认为数据非法, 哪怕机器分组 3 个,5个的跟人工的完全一致 , 也视为非法
)

转载于:https://my.oschina.net/u/2603867/blog/1842583

你可能感兴趣的:(解决爬虫数据(电影院比价系统)电影院名称不规范问题解决思路)