webmagic爬取B站用户信息

  通过谷歌浏览器的开发者模式找到了B站用户信息的接口:https://space.bilibili.com/ajax/member/GetInfo 。接口请求方式为post,需传入参数mid,mid为用户id,通过测试猜测mid在后台是自增生成的。

  开发环境: JDK1.8+Spring Boot + webmagic + mysql + mybatis plus

  在测试过程中发现,B站后台会对接口调用的来源进行检测和限制,故在请求接口时需要添加请求头 Referer:https://space.bilibili.com,关键代码如下:

public class BiliSpider2 implements PageProcessor {

    @Autowired
    private SpiderService spiderService;

    private Site site = Site.me().setTimeOut(10000).setRetryTimes(3)
    .setSleepTime(10000).setCharset("UTF-8");

  @Override
    public void process(Page page) {
        String status = new JsonPathSelector("$.status").select(page.getRawText());
        if(Boolean.valueOf(status)){
            try {
                String mid = new JsonPathSelector("$.data.mid").select(page.getRawText());
                String regtime = new JsonPathSelector("$.data.regtime").select(page.getRawText());
                String name = new JsonPathSelector("$.data.name").select(page.getRawText());
                String sex = new JsonPathSelector("$.data.sex").select(page.getRawText());
                String rank = new JsonPathSelector("$.data.rank").select(page.getRawText());
                String face = new JsonPathSelector("$.data.face").select(page.getRawText());
                String spacesta = new JsonPathSelector("$.data.spacesta").select(page.getRawText());
                String birthday = new JsonPathSelector("$.data.birthday").select(page.getRawText());
                String sign = new JsonPathSelector("$.data.sign").select(page.getRawText());
                String currentLevel = new JsonPathSelector("$.data.level_info.current_level").select(page.getRawText());
                String vipType = new JsonPathSelector("$.data.vip.vipType").select(page.getRawText());
                String vipStatus = new JsonPathSelector("$.data.vip.vipStatus").select(page.getRawText());
                String fansbadge = new JsonPathSelector("$.data.fans_badge").select(page.getRawText());
                BiliUser biliUser = new BiliUser()
                        .setMid(Integer.valueOf(mid))
                        .setName(name)
                        .setBirthday(birthday)
                        .setCurrentLevel(Integer.valueOf(currentLevel))
                        .setFace(face)
                        .setFansBadge(Boolean.valueOf(fansbadge))
                        .setRank(Integer.valueOf(rank))
                        .setSex(sex)
                        .setSpacesta(spacesta)
                        .setSign(sign)
                        .setVipType(Integer.valueOf(vipType))
                        .setVipStatus(Integer.valueOf(vipStatus))
                        .setRegtime(DateUtils.dateConvertToLocalDateTime(new Date(Integer.valueOf(regtime))));
                page.putField("user", biliUser);
              
            }
        }
    }

  @Override
   public Site getSite() {
        Set acceptStatCode = new HashSet<>();
        acceptStatCode.add(200);
        site = site.setAcceptStatCode(acceptStatCode)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .setUserAgent(UserAgentUtils.radomUserAgent());

        return site;
    }
}

  这里是最简单的实现,当然还可以在其中添加其他规则。例如,当爬取过于频繁时,有的网站会限制ip并返回错误页面,此时爬到的页面并不是我们期望的页面;下载器中没有做这类验证,也不方便在其中验证,我们就可以在这里处理。

  这里我将其集成到 Spring Boot 中:

/**
 * Spring component that owns the WebMagic {@link Spider} used to crawl
 * Bilibili user records and exposes start/stop operations for it.
 */
@Component
@Slf4j
public class SpiderService {
    @Autowired
    private BiliPipeline biliPipeline;
    @Autowired
    private BiliSpider2 biliSpider2;
    @Autowired
    private BiliUserService biliUserService;
    @Autowired
    private ProxyIpService proxyIpService;

    private Spider spider;

    /** Builds the spider once the dependencies are injected: one pipeline, 10 threads. */
    @PostConstruct
    private void init() {
        spider = Spider.create(biliSpider2)
                .addPipeline(biliPipeline)
                .thread(10);
    }

    /**
     * Queues {@code count} GetInfo requests for the user ids that follow the
     * highest id reported by {@code biliUserService.getMaxMid()} (treated as 0
     * when that is null), installs a freshly built downloader and starts the spider.
     *
     * @param count number of consecutive user ids to request; must not be null
     */
    public void start(Integer count) {
        biliPipeline.clean();
        Integer maxMid = biliUserService.getMaxMid();
        log.info("Current max mid: {}", maxMid);

        int firstMid = (maxMid == null ? 0 : maxMid) + 1;
        int endMidExclusive = firstMid + count;
        for (int mid = firstMid; mid < endMidExclusive; mid++) {
            spider.addRequest(buildRequest(mid));
        }

        spider.setDownloader(buildDownloader());
        spider.start();
    }

    /** Stops the underlying spider. */
    public void stop() {
        spider.stop();
    }

    /** Returns the underlying spider instance. */
    public Spider getSpider() {
        return spider;
    }

    /** Creates the form-encoded POST request that asks for one user's info. */
    private Request buildRequest(int mid) {
        Request request = new Request("https://space.bilibili.com/ajax/member/GetInfo");
        request.setMethod(HttpConstant.Method.POST);
        Map<String, Object> form = new HashMap<>();
        form.put("mid", mid);
        request.setRequestBody(HttpRequestBody.form(form, "utf-8"));
        // The endpoint is called with a Referer header pointing at the space site.
        request.addHeader("Referer", "https://space.bilibili.com");
        return request;
    }

    /**
     * Creates a downloader that uses the first 1000 proxies returned by the proxy
     * table when ordered by ascending {@code connect_speed}.
     * If the table is empty, no proxy provider is installed.
     */
    private HttpClientDownloader buildDownloader() {
        HttpClientDownloader downloader = new HttpClientDownloader();
        List<ProxyIp> candidates = proxyIpService.list(new QueryWrapper<ProxyIp>()
                .orderByAsc("connect_speed")
                .last("limit 1000"));

        List<Proxy> proxyList = new ArrayList<>(candidates.size());
        for (ProxyIp proxyIp : candidates) {
            proxyList.add(new Proxy(proxyIp.getIp(), proxyIp.getPort()));
        }

        if (proxyList.isEmpty()) {
            // A provider with zero proxies has nothing to hand out; leave the
            // downloader without one rather than installing an unusable provider.
            log.warn("No proxies available, crawling without a proxy provider");
        } else {
            downloader.setProxyProvider(SimpleProxyProvider.from(proxyList.toArray(new Proxy[0])));
        }
        return downloader;
    }
}
/**
 * REST endpoints that start and stop the Bilibili user crawl by delegating to
 * {@code SpiderService}.
 */
@Slf4j
@RestController
public class BiliController {

    /** Number of user ids to crawl when the caller does not supply a count. */
    private static final int DEFAULT_COUNT = 100;

    @Autowired
    private SpiderService spiderService;

    /**
     * Starts a crawl of {@code count} user ids.
     *
     * <p>The count is optional: {@code /start} uses {@link #DEFAULT_COUNT}, while
     * {@code /start/{count}} uses the supplied value. With only the
     * {@code /start/{count}} mapping and a required path variable, the null
     * default below could never be reached.
     *
     * @param count number of user ids to crawl, or null to use the default
     */
    @GetMapping({"/start", "/start/{count}"})
    public void start(@PathVariable(required = false) Integer count) {
        if (count == null) {
            count = DEFAULT_COUNT;
        }
        spiderService.start(count);
    }

    /** Stops the running crawl. */
    @GetMapping("/stop")
    public void stop() {
        spiderService.stop();
    }

}

你可能感兴趣的:(webmagic爬取B站用户信息)