某音粉丝抓取技术探讨

        本着学习研究之目的,做了一个提取工具。实测提取效率挺高:3s左右可以提取到20个粉丝(fans)的信息,一天提取十几万条信息不是梦……

        目前很多爬虫,无法做到连续爬取一个用户的粉丝信息,爬取完前几页后总是会出现获取不到数据的情况,本工具针对这种情况特意做了优化,实现了连续不间断爬取,主要思路就是通过注册新设备号解决。

上图:

某音粉丝抓取技术探讨_第1张图片

前端用的是 ng(Angular)+ electron,贴一下代码:

   
import { Component,NgZone } from '@angular/core';
import { WebsqlService } from './service/websql.service';
import { HttpService } from './service/http.service';
import { NzMessageService } from 'ng-zorro-antd/message';
declare var XLSX: any;


@Component({
  selector: 'app-root',
  templateUrl: './app.component.html',
  styleUrls: ['./app.component.css']
})
export class AppComponent {
  /** API token sent as the `token` parameter of every request this component issues. */
  token = "xxx";
  /** Assigned from the `money` field of the `/v2/account/query` response (see `getMoney`). */
  money = "";
  /** Target user id; sent as `uid` to the follower endpoint and restored from the last progress row. */
  uid = "";
  /** Sent as the `ip` request parameter; the validation message refers to it as the proxy IP. */
  pip = "";
  /** Running total of collected followers, stored with and restored from the progress record. */
  count = 0;
  /** Restored from the `more` column of the last progress row; `startLastCollect` returns early when it is falsy. */
  more = false;
  /** Restored from the `min` column of the last progress row; sent as `cursor` by `startLastCollect`. */
  cursor = 0;
  /** Set to true when a collection request is issued; cleared by `stopCollect` or when no more pages are reported. */
  isRunning = false;
  /**
   * Receives the injected collaborators and primes the component state:
   * creates the local tables, then loads the account balance and the most
   * recently stored collection progress.
   */
  constructor(
    private _ngZone: NgZone,
    private websqlService: WebsqlService,
    private http: HttpService,
    private message: NzMessageService,
  ) {
    // Tables must exist before any progress row can be read back.
    this.websqlService.createTB();

    this.getMoney();
    this.getLastData();
  }

  /**
   * Loads the progress record returned by `websqlService.findLastAll` and
   * copies its `uid`, `count`, `more` and `min` columns into the component
   * (`min` becomes `cursor`). Does nothing when no row is returned.
   */
  getLastData() {
    this.websqlService.findLastAll((result) => {
      const hasRows = result && result.rows && result.rows.length > 0;
      if (!hasRows) {
        return;
      }

      // The assignments run inside the Angular zone so the change is picked up by change detection.
      this._ngZone.run(() => {
        const latest = result.rows.item(0);
        this.uid = latest.uid;
        this.count = latest.count;
        this.more = latest.more;
        this.cursor = latest.min;
      });
    });
  }

  /**
   * Refreshes `this.money` from the account endpoint.
   *
   * Queries `/v2/account/query` with the current token and copies the `money`
   * field of the response into `this.money`. An empty response or a failed
   * request leaves the previous value untouched and is reported on the console
   * instead of being silently discarded.
   */
  getMoney(){
    this.http.get("/v2/account/query", {token: this.token},
      (data) => {
        if (data == null) {
          // Without this guard a null/undefined response throws inside the callback.
          console.warn("getMoney: empty response from /v2/account/query");
          return;
        }
        this.money = data.money;
      },
      (msg) => {
        console.warn("getMoney: account query failed", msg);
      });
  }
  /**
   * Starts a fresh follower collection for `this.uid`, requesting the page at `cursor`.
   *
   * Both `uid` and `pip` must be filled in; otherwise an error toast is shown
   * and no request is made. Every follower in the response is stored through
   * `websqlService.insertUserValue`, then the progress record is written with
   * `insertLastValue` and reloaded. While the response reports `has_more`, the
   * next page is requested through `startLastCollect()` after a one-second pause.
   *
   * A "No results." reply, a response without a follower list, or a failed
   * request is retried with the same cursor after a pause. Retries and
   * continuations are abandoned once `stopCollect()` has cleared `isRunning`.
   *
   * @param cursor paging cursor sent to the follower endpoint
   */
  startNewCollect(cursor){
    if(this.uid == ""){
      this.message.create("error","请输入UID后再采集...");
      return;
    }
    if(this.pip == ""){
      this.message.create("error","请输入代理IP后再采集...");
      return;
    }

    const param = {
      token: this.token,
      uid: this.uid,
      cursor: cursor,
      ip: this.pip
    };
    this.isRunning = true;

    const pauseMs = 1000;
    // Retry the same page after a pause; the flag is re-checked when the timer
    // fires so that a stop requested during the wait is honoured.
    const retryLater = () => {
      setTimeout(() => {
        if (this.isRunning) {
          this.startNewCollect(cursor);
        }
      }, pauseMs);
    };

    this.http.get("/v2/douyin/user/follower", param,
      (data) => {
        const payload = data && data.data;
        const usable = payload
          && payload.msg !== "No results."
          && Array.isArray(payload.followers);
        if (!usable) {
          retryLater();
          return;
        }

        payload.followers.forEach(fans => {
          // Prefer short_id when it is longer than two characters, else fall back to unique_id.
          const shortId = fans.short_id;
          const displayId = shortId && shortId.length > 2 ? shortId : fans.unique_id;
          this.websqlService.insertUserValue(fans.uid, displayId);
        });
        this.websqlService.insertLastValue(
          this.uid,
          Number(this.count) + payload.followers.length,
          payload.has_more,
          payload.min_time);
        this.getLastData();

        if (payload.has_more && this.isRunning) {
          setTimeout(() => {
            // startLastCollect() sets isRunning back to true, so check before calling it.
            if (this.isRunning) {
              this.startLastCollect();
            }
          }, pauseMs);
        } else {
          this.isRunning = false;
        }
      },
      () => {
        retryLater();
      });
  }
  /**
   * Continues the previous collection from the stored cursor.
   *
   * Reloads the stored progress, validates that `uid` and `pip` are present,
   * refreshes the balance, and returns early when `more` is falsy. Otherwise
   * it requests the page at `this.cursor`, stores every follower through
   * `websqlService.insertUserValue`, writes the progress record with
   * `insertLastValue`, and schedules itself again after a one-second pause
   * while the response reports `has_more`.
   *
   * A "No results." reply, a response without a follower list, or a failed
   * request is retried after a pause. Retries and continuations are abandoned
   * once `stopCollect()` has cleared `isRunning`.
   */
  startLastCollect(){
    // NOTE(review): getLastData() delivers its result through a callback; if that
    // callback runs asynchronously, the checks below still see the values loaded
    // by an earlier call — confirm against WebsqlService.
    this.getLastData();
    if(this.uid == ""){
      this.message.create("error","请输入UID后再采集...");
      return;
    }
    if(this.pip == ""){
      this.message.create("error","请输入代理IP后再采集...");
      return;
    }

    this.getMoney();
    if(!this.more){return;}

    const param = {
      token: this.token,
      uid: this.uid,
      cursor: this.cursor,
      ip: this.pip
    };
    this.isRunning = true;

    const pauseMs = 1000;
    // Re-enter after a pause; the flag is re-checked when the timer fires because
    // re-entering sets isRunning back to true and would override a stop request.
    const continueLater = () => {
      setTimeout(() => {
        if (this.isRunning) {
          this.startLastCollect();
        }
      }, pauseMs);
    };

    this.http.get("/v2/douyin/user/follower", param,
      (data) => {
        const payload = data && data.data;
        const usable = payload
          && payload.msg !== "No results."
          && Array.isArray(payload.followers);
        if (!usable) {
          continueLater();
          return;
        }

        payload.followers.forEach(fans => {
          // Prefer short_id when it is longer than two characters, else fall back to unique_id.
          const shortId = fans.short_id;
          const displayId = shortId && shortId.length > 2 ? shortId : fans.unique_id;
          this.websqlService.insertUserValue(fans.uid, displayId);
        });
        this.websqlService.insertLastValue(
          this.uid,
          Number(this.count) + payload.followers.length,
          payload.has_more,
          payload.min_time);
        this.getLastData();

        if (payload.has_more && this.isRunning) {
          continueLater();
        } else {
          this.isRunning = false;
        }
      },
      () => {
        continueLater();
      });
  }
  /**
   * Stop: clears the running flag that the collection callbacks check
   * before scheduling the next page.
   */
  stopCollect() {
    this.isRunning = false;
  }

  exportExcel(){
    let list = [];
    const me = this;
    me.websqlService.findUserAll(function(data){
        if(data&& data.rows&&data.rows.length>0){
          for(var i=0;i

效果比较理想,主要是采集过程中不会中断,可以在不中断的前提下更换代理IP。

你可能感兴趣的:(某音粉丝抓取技术探讨)