本文是在之前两篇文章的基础上进行写作的
(1条消息) 【java爬虫】使用selenium爬取优惠券_haohulala的博客-CSDN博客
(1条消息) 【java爬虫】使用selenium获取某宝联盟淘口令_haohulala的博客-CSDN博客
前两篇文章介绍了如何获取优惠券的基础信息,本文将获取到的基本信息存到数据库中,并且利用数据库的排序获取优惠力度最大的优惠券。这里的优惠力度指的是用券前价格减去券后价格获得的优惠价格绝对值。
相对于之前的文章,本文中我优化了爬虫的爬取策略,把爬虫放到一个新的线程,一页一页不停地获取数据。在爬取的过程中我发现,程序会比较脆弱,很多情况下都会导致程序崩溃,比如某个按钮被遮挡住了,selenium就找不到了,这个时候就会抛异常。像这类问题只能后面慢慢优化了(如果有时间的话)。
首先来看一下爬虫线程运行的情况
我们需要存到数据库中的数据比较多,更新了实体类
/**
 * One coupon record as scraped from the promotion page and stored in the database.
 *
 * <p>The preferential price and rate are not scraped; they are derived from the
 * pre-coupon and post-coupon prices by {@link #calculatePreferentialPriceAndRate()}.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class GoodItem {
    /** Coupon title. */
    private String title;
    /** Image URL. */
    private String imgUrl;
    /** Price before the coupon is applied. */
    private Double prePrice;
    /** Price after the coupon is applied. */
    private Double postPrice;
    /** Commission rate. */
    private Double commissionRate;
    /** Commission amount. */
    private Double commission;
    /** Share token (the "recommend" text). */
    private String recommend;
    /** Creation time, kept as a formatted string. */
    private String serializeTime;
    /** Discount amount: {@code prePrice - postPrice}. */
    private Double preferentialPrice;
    /** Discount rate: {@code preferentialPrice / prePrice}. */
    private Double preferentialRate;

    /**
     * Computes {@code preferentialPrice} and {@code preferentialRate} from the two prices
     * and rounds both to four decimal places (half-up).
     *
     * <p>If either price is missing, both derived values are set to {@code 0.0}.
     * If the pre-coupon price is zero the rate is set to {@code 0.0} instead of
     * dividing by zero.
     */
    public void calculatePreferentialPriceAndRate() {
        if (prePrice == null || postPrice == null) {
            preferentialPrice = 0.0;
            preferentialRate = 0.0;
            return;
        }
        double saved = prePrice - postPrice;
        // Dividing by a zero price would produce NaN or Infinity.
        double rate = (prePrice == 0.0) ? 0.0 : saved / prePrice;
        preferentialPrice = roundToFourDecimals(saved);
        preferentialRate = roundToFourDecimals(rate);
    }

    /**
     * Rounds a value to four decimal places using half-up rounding.
     *
     * <p>This avoids the String.format / Double.parseDouble round trip, which depends on
     * the default locale and fails where the decimal separator is a comma.
     * NaN and infinite values are returned unchanged because BigDecimal cannot hold them.
     */
    private static double roundToFourDecimals(double value) {
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            return value;
        }
        return java.math.BigDecimal.valueOf(value)
                .setScale(4, java.math.RoundingMode.HALF_UP)
                .doubleValue();
    }
}
从上述代码中可以看出来,优惠价格和优惠率是自己计算的,在获取完券前价格和券后价格后就可以计算这两个值了,最后的结果保留四位小数(注意是四位小数,不是四位有效数字),保留时会对第五位小数进行四舍五入。
爬虫程序与之前的不同就是每当获取一个完整的数据后就存到数据库中,然后一页一页不停地获取数据。
@Slf4j
@Service
public class SeleinumServiceImpl implements SeleinumService {
private final String DRIVER_PATH = "E:/写作/优惠券项目/驱动/chromedriver.exe";
@Override
public void startSelenium() {
// 实例化BrowserMob代理
System.setProperty("webdriver.chrome.driver", DRIVER_PATH);
BrowserMobProxy browserMobProxy = new BrowserMobProxyServer();
browserMobProxy.start();
browserMobProxy.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);
browserMobProxy.setHarCaptureTypes(CaptureType.RESPONSE_CONTENT);
browserMobProxy.newHar("kk");
Proxy seleniumProxy = ClientUtil.createSeleniumProxy(browserMobProxy);
// 实例化Selenium
ChromeOptions options = new ChromeOptions();
options.setProxy(seleniumProxy);
options.setAcceptInsecureCerts(true);
//options.setExperimentalOption("debuggerAddress", "127.0.0.1:9222");
WebDriver driver = new ChromeDriver(options);
// 网络请求回调函数
browserMobProxy.addRequestFilter(new RequestFilter() {
@Override
public HttpResponse filterRequest(HttpRequest httpRequest, HttpMessageContents httpMessageContents, HttpMessageInfo httpMessageInfo) {
// 打印请求信息
// log.info("request=>" + httpMessageInfo.getUrl());
return null;
}
});
// 网络响应回调函数
browserMobProxy.addResponseFilter(new ResponseFilter() {
@Override
public void filterResponse(HttpResponse httpResponse, HttpMessageContents httpMessageContents, HttpMessageInfo httpMessageInfo) {
// 这里获取打印的信息
log.info(httpMessageInfo.getUrl());
if(httpMessageInfo.getUrl().equals("https://pub.alimama.com/openapi/param2/1/gateway.unionpub/union.pub.entry")) {
// 格式化输出
String str = JSONObject.toJSONString(httpMessageContents.getTextContents(), true);
System.out.println(str);
// 将数据写到文件中
try {
FileWriter writer = new FileWriter("output.txt");
writer.write(str);
} catch (IOException e) {
e.printStackTrace();
}
}
}
});
// 打开网页
driver.get("https://pub.alimama.com/portal/v2/pages/promo/goods/index.htm?pageNum=2");
}
@Override
public List getGoodInfo() {
// 加载chrome驱动
System.setProperty("webdriver.chrome.driver", DRIVER_PATH);
ChromeOptions options = new ChromeOptions();
options.setExperimentalOption("debuggerAddress", "127.0.0.1:9222");
// 启动浏览器
WebDriver driver = new ChromeDriver(options);
// 设置最长等待时间
driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
// 实例化一个列表存放数据
List rstList = new ArrayList<>();
// 开始遍历卡片数据
// 遍历100组数据暂停
for(int i=0; i<100; ) {
WebElement element = driver.findElement(By.className("GoodsList__CardList-sc-84so0w-1"));
List divList = element.findElements(By.className("union-good-card-wrap"));
log.info("获取" + divList.size() + "个优惠券卡片");
for(int j=0; j< divList.size(); j++) {
GoodItem item = new GoodItem();
// 图片url
item.setImgUrl(divList.get(j).findElement(By.className("union-good-card-good-img-wrap-mediumn"))
.findElement(By.tagName("a")).findElement(By.tagName("img")).getDomAttribute("src"));
// 优惠券标题
item.setTitle(divList.get(j).findElement(By.className("union-good-card-title"))
.findElement(By.tagName("span")).getText());
// 券前价格
item.setPrePrice(getPrice(divList.get(j)
.findElement(By.className("union-good-card-coupon-reserve-price-mediumn"))));
// 券后价格
item.setPostPrice(getPrice(divList.get(j)
.findElement(By.className("union-good-card-coupon-final-price"))));
List commissionList = divList.get(j).findElements(By.className("union-good-card-commision-info-item"));
// 佣金率
item.setCommissionRate(getPrice(commissionList.get(0)));
// 佣金
item.setCommission(getPrice(commissionList.get(1)));
log.info(JSON.toJSONString(item));
i++;
if(i == 100) {
log.info("100条数据获取完毕");
return rstList;
}
}
// 切换到下一页
driver.findElement(By.className("GoodsList__Pagination-sc-84so0w-2"))
.findElement(By.className("mux-pagination-icon-next")).click();
log.info("进入到下一页");
}
return rstList;
}
// 获取券前券后价格
private Double getPrice(WebElement element) {
StringBuilder sb = new StringBuilder();
sb.append(element.findElement(By.className("union-number-format-integer")).getText().replaceAll(",", ""));
sb.append(element.findElement(By.className("union-number-format-pointer")).getText());
sb.append(element.findElement(By.className("union-number-format-decimal")).getText());
Double price = Double.parseDouble(sb.toString());
return price;
}
}
数据库部分我们简单建一个表存数据,然后再建一些索引方便去重和加速查询
USE coupon_db;

/* Coupon table: dropped first so the script can be re-run. */
DROP TABLE IF EXISTS t_coupon;
CREATE TABLE t_coupon (
    u_id               BIGINT(20) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY COMMENT '优惠券id',
    title              VARCHAR(255) NOT NULL COMMENT '优惠券标题',
    img_url            VARCHAR(500) NOT NULL COMMENT '图片的url',
    pre_price          DOUBLE       NOT NULL COMMENT '券前价格',
    post_price         DOUBLE       NOT NULL COMMENT '券后价格',
    preferential_price DOUBLE       NOT NULL COMMENT '优惠价格',
    preferential_rate  DOUBLE       NOT NULL COMMENT '优惠率',
    commission_rate    DOUBLE       NOT NULL COMMENT '佣金率',
    commission         DOUBLE       NOT NULL COMMENT '佣金',
    recommend          VARCHAR(500) NOT NULL COMMENT '淘口令',
    serialize_time     VARCHAR(50)  NOT NULL COMMENT '创建的时间'
) ENGINE=InnoDB COMMENT '优惠券信息表';

/* The unique index on title makes the title the identity of a coupon;
   the remaining plain indexes are there to speed up queries. */
ALTER TABLE t_coupon
    ADD UNIQUE INDEX title_index (title),
    ADD INDEX preferential_price_index (preferential_price),
    ADD INDEX serialize_time_index (serialize_time),
    ADD INDEX commission_index (commission),
    ADD INDEX commission_rate_index (commission_rate);
本文就简单写一个mapper接口,其中包含三个方法,用来存数据和查数据
/**
 * MyBatis mapper for stored coupon items.
 */
@Mapper
public interface GoodMapper {

    /** Clears the table. */
    void clearTable();

    /**
     * Inserts a single item.
     *
     * @param item the item to store, exposed to the statement as {@code item}
     */
    void insertOneItem(@Param("item") GoodItem item);

    /**
     * Queries items ordered by preferential price, descending.
     *
     * @param start exposed to the statement as {@code start}
     * @param num   exposed to the statement as {@code num}
     * @return the matching items
     */
    List<GoodItem> selectByOrder(@Param("start") int start,
                                 @Param("num") int num);
}
/* Deletes every row of t_coupon; presumably the statement behind GoodMapper.clearTable - confirm against the mapper XML. */
delete from t_coupon where 1=1
/* Inserts one row from the "item" parameter; presumably the statement behind GoodMapper.insertOneItem - confirm against the mapper XML. */
insert into t_coupon
(title, img_url, pre_price, post_price,
commission_rate, commission, recommend, serialize_time,
preferential_price, preferential_rate) values
(#{item.title}, #{item.imgUrl}, #{item.prePrice}, #{item.postPrice},
#{item.commissionRate}, #{item.commission}, #{item.recommend}, #{item.serializeTime},
#{item.preferentialPrice}, #{item.preferentialRate})
/* On a duplicate key, title is assigned to itself, so the existing row keeps its values. */
on duplicate key update title=title
我们将开启爬虫和查询的接口都写在一个Controller里面
/**
 * HTTP entry points for starting the crawler and for querying stored coupons.
 */
@Controller
public class BootController {

    @Autowired
    private SeleniumPlusService seleniumPlusService;

    @Autowired
    private SelectService selectService;

    /**
     * Launches the crawler on a new thread and returns immediately.
     *
     * @return the literal string {@code "success"}
     */
    @RequestMapping("/bootstart")
    @ResponseBody
    public String bootstart() {
        // The crawl runs on its own thread so the request does not wait for it.
        Thread spiderThread = new Thread(() -> seleniumPlusService.startSpider());
        spiderThread.start();
        return "success";
    }

    /**
     * Returns the result of the select service as a JSON string.
     *
     * @param start first path variable, passed through to the select service
     * @param num   second path variable, passed through to the select service
     * @return the query result serialized as JSON
     */
    @RequestMapping("/select/{start}/{num}")
    @ResponseBody
    public String selectByOrderDESC(@PathVariable("start") int start,
                                    @PathVariable("num") int num) {
        return JSON.toJSONString(selectService.selectByOrderDESC(start, num));
    }
}
其中查询的Service非常简单,就是执行mapper中的查询接口然后将数据返回
/**
 * Query service that delegates directly to {@link GoodMapper}.
 */
@Service
public class SelectServiceImpl implements SelectService {

    @Autowired
    private GoodMapper goodMapper;

    /**
     * Returns the items produced by the mapper's ordered query.
     *
     * @param start passed to the mapper as {@code start}
     * @param num   passed to the mapper as {@code num}
     * @return the items returned by the mapper
     */
    @Override
    public List<GoodItem> selectByOrderDESC(int start, int num) {
        // The second argument must be the num parameter; "end" is not defined in this method.
        return goodMapper.selectByOrder(start, num);
    }
}
查询接口是一个GET请求,路径中的两个参数分别是起始索引和查询的数据量,返回的是按照优惠价格降序排序后的数据。
比如下图执行的请求接口是 localhost:8080/select/0/10 ,查询数据库里面优惠价格最高的10条数据
上述的查询对应的sql语句如下。
在SQL语句中 limit 后面的两个参数分别是开始的索引和查询的数据量,比如下面这条SQL语句的意思应该是从第0条数据开始查,一共查出10条数据。
/* Rows ordered by preferential_price, highest first: start at row 0 and return 10 rows. */
SELECT *
FROM t_coupon
ORDER BY preferential_price DESC
LIMIT 0, 10
查询出来的数据如下
[{
"commission": 2.64,
"commissionRate": 5.3,
"imgUrl": "//img.alicdn.com/bao/uploaded/i3/6000000002126/O1CN01Z5K9L61RZktaDO7mX_!!6000000002126-0-sm.jpg",
"postPrice": 49.9,
"prePrice": 99.0,
"preferentialPrice": 49.1,
"preferentialRate": 0.496,
"recommend": "立白天然茶籽洗衣液6KG 家庭实惠洗衣除菌除螨 89.00元\n本月上新\n买它!超值!\nh:/89¥ CZ3457 tptfdGVZ6Wj¥\n",
"serializeTime": "2023-07-16 13:16:28",
"title": "立白天然茶籽洗衣液6KG 家庭实惠洗衣除菌除螨"
}, {
"commission": 7.11,
"commissionRate": 9.0,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN010BdQ3w1eaIJCsydxO_!!6000000003887-0-yinhe.jpg",
"postPrice": 49.5,
"prePrice": 89.0,
"preferentialPrice": 39.5,
"preferentialRate": 0.4438,
"recommend": "Joocyee酵色唇釉琥珀唇彩丝绒口红哑光贝壳镜面唇泥太妃糖复古女 79.00元\n历史热推\n速速抢购,手快有,手慢无!\nh:/68¥ CZ0001 30W9dGVaz77¥\n",
"serializeTime": "2023-07-16 13:16:41",
"title": "Joocyee酵色唇釉琥珀唇彩丝绒口红哑光贝壳镜面唇泥太妃糖复古女"
}, {
"commission": 0.67,
"commissionRate": 1.35,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01oDxvso1iw3dVd7jec_!!6000000004476-0-yinhe.jpg",
"postPrice": 49.9,
"prePrice": 84.9,
"preferentialPrice": 35.0,
"preferentialRate": 0.4122,
"recommend": "蒙牛特仑苏纯牛奶250ml*16盒整箱学生早餐奶高端(新旧包装混发) 82.90元\n超十万人正在疯抢\n喜欢的宝宝们千万不要错过哦~赶紧买起来买起来!~\nh:/49¥ CZ3457 kbB6dGeADLs¥\n",
"serializeTime": "2023-07-16 13:16:01",
"title": "蒙牛特仑苏纯牛奶250ml*16盒整箱学生早餐奶高端(新旧包装混发)"
}, {
"commission": 0.9,
"commissionRate": 1.5,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01hCQXtN1Oc5yax1WHF_!!6000000001725-0-yinhe.jpg",
"postPrice": 46.57,
"prePrice": 79.9,
"preferentialPrice": 33.33,
"preferentialRate": 0.4171,
"recommend": "【零感003】杰士邦避孕套正品官方旗舰店安全套超薄男用裸入持久1 59.90元\n赠运费险\n买它就对了~\nh:/77¥ CZ3457 Y5i7dGVZj30¥\n",
"serializeTime": "2023-07-16 13:16:33",
"title": "【零感003】杰士邦避孕套正品官方旗舰店安全套超薄男用裸入持久1"
}, {
"commission": 0.33,
"commissionRate": 0.3,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01BvRfuK22sf0fmsdtn_!!6000000007176-0-yinhe.jpg",
"postPrice": 109.0,
"prePrice": 139.0,
"preferentialPrice": 30.0,
"preferentialRate": 0.2158,
"recommend": "罗马仕20000毫安充电宝双自带线双向快充大容量1万小巧移动电源闪充适用华为苹果iphone小米oppo手机专用户外 109.00元\n超十万人正在疯抢\n快~少量库存!!赶紧冲拼手速!!\nh:/19¥ CZ0001 0GK2dGVayGy¥\n",
"serializeTime": "2023-07-16 13:16:24",
"title": "罗马仕20000毫安充电宝双自带线双向快充大容量1万小巧移动电源闪充适用华为苹果iphone小米oppo手机专用户外"
}, {
"commission": 1.6,
"commissionRate": 1.8,
"imgUrl": "//img.alicdn.com/bao/uploaded/i1/2200828292428/O1CN01f5SKRV1To4V1gBrc1_!!2200828292428.jpg",
"postPrice": 89.0,
"prePrice": 118.0,
"preferentialPrice": 29.0,
"preferentialRate": 0.2458,
"recommend": "逐本清欢晨蜜自在自然植萃卸妆油敏弱肌脸部舒缓深层清洁卸妆水膏 89.00元\n回头客1万+\n质量逆天,赶紧的,闭眼买都不亏!!\nh:/59¥ CZ0001 JrpUdGVafyH¥\n",
"serializeTime": "2023-07-16 13:16:06",
"title": "逐本清欢晨蜜自在自然植萃卸妆油敏弱肌脸部舒缓深层清洁卸妆水膏"
}, {
"commission": 0.69,
"commissionRate": 1.35,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01ry5fh31G8llXLIPuR_!!6000000000578-0-yinhe.jpg",
"postPrice": 50.9,
"prePrice": 74.4,
"preferentialPrice": 23.5,
"preferentialRate": 0.3159,
"recommend": "蒙牛纯牛奶全脂灭菌乳250ml*24盒/1箱学生营养早餐搭配优质乳蛋白 71.40元\n回头客12万+\n买它就对了~\nh:/97¥ CZ3457 nlVhdGVZUPV¥\n",
"serializeTime": "2023-07-16 13:16:10",
"title": "蒙牛纯牛奶全脂灭菌乳250ml*24盒/1箱学生营养早餐搭配优质乳蛋白"
}, {
"commission": 2.21,
"commissionRate": 4.5,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01gO1IfQ1ljdhW0a0LT_!!6000000004855-0-yinhe.jpg",
"postPrice": 49.0,
"prePrice": 69.0,
"preferentialPrice": 20.0,
"preferentialRate": 0.2899,
"recommend": "蕉下修容口罩EM320 防晒护眼角开车面罩防紫外线立体夏女透气户外 49.00元\n好评过万\n不要犹豫!库存不多抓紧抢!\nh:/59¥ CZ0001 LYJSdGVZeik¥\n",
"serializeTime": "2023-07-16 13:16:37",
"title": "蕉下修容口罩EM320 防晒护眼角开车面罩防紫外线立体夏女透气户外"
}, {
"commission": 3.05,
"commissionRate": 10.5,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01siAhJN1Hwyo2vfWAr_!!6000000000823-0-yinhe.jpg",
"postPrice": 29.0,
"prePrice": 49.0,
"preferentialPrice": 20.0,
"preferentialRate": 0.4082,
"recommend": "猫人抗菌裆男士内裤男冰丝无痕四角裤运动纯棉裆平角大码裤青少年 29.00元\n回头客2万+\n质量逆天,赶紧的,闭眼买都不亏!!\nh:/17¥ CZ0001 ZxlhdGVaNAb¥\n",
"serializeTime": "2023-07-16 13:16:51",
"title": "猫人抗菌裆男士内裤男冰丝无痕四角裤运动纯棉裆平角大码裤青少年"
}, {
"commission": 1.79,
"commissionRate": 6.0,
"imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01PyEz521NEuMNqT3Av_!!6000000001539-0-yinhe.jpg",
"postPrice": 29.9,
"prePrice": 49.9,
"preferentialPrice": 20.0,
"preferentialRate": 0.4008,
"recommend": "【百亿补贴】进口茱蒂丝巧克力夹心饼干纯可可脂儿童健康休闲零食 29.90元\n近7天浏览过万\n这价位能做到这样真的无可挑剔!\nh:/98¥ CZ0001 eJUpdGVZvWR¥\n",
"serializeTime": "2023-07-16 13:17:51",
"title": "【百亿补贴】进口茱蒂丝巧克力夹心饼干纯可可脂儿童健康休闲零食"
}]
可以看到有一些优惠券的优惠力度还是挺大的,可以省几十元,就是不知道这个券前价格准不准哈哈哈。