虎牙、B站网页信息python抓取试试

利用虎牙和b站的网页来抓取用户及其粉丝数量

使用时要注意:
(1)cmd 命令下 python test.py
后面有参数,如果没敲参数,会有提示,如:

useage: python test.py <roomIdStart> <roomIdInterval> <multiProcessNum> <loopTimes>
param num is less than 5, num is 1

四个参数依次是:起始房间号、搜索组的大小(每个进程负责的房间数)、线程数(代码中实际为进程数)、循环次数
搜索的总共房间数 = 搜索组的大小 * 线程数 * 循环次数
(2)结果
结果放在文件 data/target.txt
注意事先创建文件夹 data,否则提示没有 data/target.txt
(3)例子(虎牙):

================= new record ==================
time:2022-02-20 10:18:55
roomIdStart:1000
roomIdInterval:10
multiProcessNum:18
loopTimes:2
range[1000:1360]
===============================================
1066 电子厂-心态 7945900 1539218884
1123 奇领颜韵Ycy【万徒】 117598 1199552286636
==========================
总共时18.54秒

##################################
# 每日一抓:  虎牙粉丝排行榜数据 #
##################################
from urllib.request import urlopen
import sys



# Create the file, or empty it if it already exists.
def create_file(name):
    """Create *name* as an empty file, truncating any existing content."""
    with open(name, "w"):
        pass

# Append already-encoded data to a file.
def save_data_to_file(fileName, bufUtf8):
    """Append the bytes *bufUtf8* to *fileName*, creating the file if needed.

    Args:
        fileName: Path of the file to append to.
        bufUtf8: Bytes to write (callers pass UTF-8 encoded text).
    """
    # The context manager closes the handle even when the write fails; the
    # original never closed it.
    with open(fileName, "ab") as f:
        f.write(bufUtf8)



# Scrape the subscriber count from a Huya video page.
# Example page: https://v.huya.com/u/1199553057095

def get_subscribe_num_by_idstr(idstr):
    """Return the subscriber count shown on a Huya video page, as a string.

    Only lines 180-280 of the response are inspected, to save time.

    Args:
        idstr: Video-page id; it is appended to ``https://v.huya.com/u/``.

    Returns:
        The text following the subscribe marker on the last matching line
        inside the scanned window, or ``"0"`` when no line matches.
    """
    videoBaseUrl = 'https://v.huya.com/u/'
    subscribeMark = "                        订阅:"
    firstLine, lastLine = 180, 280
    result = "0"
    # Close the HTTP response deterministically; the original leaked it.
    with urlopen(videoBaseUrl + idstr) as response:
        for cnt, line in enumerate(response, start=1):
            if cnt < firstLine:
                continue
            if cnt > lastLine:
                # Nothing past the window is used, so stop downloading.
                break
            line_str = line.decode(encoding="utf-8")
            pos = line_str.find(subscribeMark)
            if pos != -1:
                # Slice from where the marker actually is and strip whatever
                # line ending is present, instead of assuming the marker sits
                # at column 0 and the line ends in exactly "\r\n".
                result = line_str[pos + len(subscribeMark):].rstrip("\r\n")
    return result

# Scrape one Huya live-room page.
# Example page: https://www.huya.com/298039

# NOTE(review): every HTML marker literal used by web_content_pro was eaten by
# the page renderer when this listing was published (only fragments such as a
# trailing "\r\n" and the word "视频" survive), and the anchorEndMark /
# videoEndMark assignments vanished entirely. The placeholder below never
# matches, so every room is reported as not found until the real markers are
# restored from the live page source.
_LOST_MARKER = "<<HTML marker lost in publication - restore from page source>>"


def web_content_pro(url, roomId, fileName):
    """Scrape one live-room page and append a result row to *fileName*.

    Lines 100-200 of the page are scanned for the streamer name, the
    subscriber count and the video-page id. When a name is found, the
    tab-separated row ``roomId, anchor, subscribe, video`` is printed and
    appended to *fileName* as UTF-8; otherwise a "not found" line is printed.
    If the room page reports ``'0'`` subscribers, the count is re-read from
    the video page and the row is tagged ``[alarm:subscribe=0]``.

    Args:
        url: Full URL of the room page.
        roomId: Room id, used only for output.
        fileName: File that receives the result row.
    """
    anchorMark = _LOST_MARKER
    anchorEndMark = _LOST_MARKER
    subscribeMark = _LOST_MARKER
    subscribeEndMark = _LOST_MARKER
    videoMark = _LOST_MARKER
    videoEndMark = _LOST_MARKER

    anchor = ""
    subscribe = "0"
    video = ""
    haveAnchor = haveSubscribe = haveVideo = False
    with urlopen(url) as response:
        for cnt, line in enumerate(response, start=1):
            # Restrict the scan to the useful part of the page to save time.
            if cnt < 100:
                continue
            if cnt > 200:
                break
            line_str = line.decode(encoding="utf-8")
            length = len(line_str)
            # Streamer name: text after the first ">" on the marker line.
            if not haveAnchor and line_str.find(anchorMark) != -1:
                haveAnchor = True
                anchor = line_str[line_str.find(">") + 1: length - len(anchorEndMark)]
            # Subscriber count.
            if not haveSubscribe and line_str.find(subscribeMark) != -1:
                haveSubscribe = True
                subscribe = line_str[len(subscribeMark): length - len(subscribeEndMark)]
            # Id of the matching video page.
            if not haveVideo and line_str.find(videoMark) != -1:
                haveVideo = True
                video = line_str[len(videoMark): length - len(videoEndMark)]
            if haveAnchor and haveSubscribe and haveVideo:
                break

    if len(anchor) > 0:
        endMark = ""
        if subscribe == '0':
            # The room page sometimes shows 0; fall back to the video page.
            subscribe = get_subscribe_num_by_idstr(video)
            endMark = "\t[alarm:subscribe=0]"
        resultOut = str(roomId) + "\t" + anchor + "\t" + subscribe + "\t" + video + endMark
        print(resultOut)
        save_data_to_file(fileName, (resultOut + "\n").encode(encoding="utf-8"))
    else:
        print("roomId:" + str(roomId) + "==>【未找到】")


# Scrape a consecutive run of rooms.
def catch_data(fileName, url, start, num):
    """Scrape rooms ``start`` .. ``start + num - 1`` and append rows to *fileName*."""
    for x in range(num):
        web_content_pro(url + str(start + x), start + x, fileName)


# Concurrent tasks
from multiprocessing import Process
from os import getpid

urlStd = "https://www.huya.com/"
# Prefix of the per-worker result files: data/fansData<index>.txt
fileForSaveData = "data/fansData"
# File that accumulates the results of every run.
targetFileName = "data/target.txt"


def catch_data_task(index, start, num):
    """Worker entry point: scrape *num* rooms into this worker's own file."""
    newName = fileForSaveData + str(index) + ".txt"
    create_file(newName)
    catch_data(newName, urlStd, start, num)


# Append the per-worker files to the target file.
def target_file_add_content(num):
    """Append the result files of workers ``0 .. num - 1`` to the target file."""
    with open(targetFileName, "ab") as target:
        for x in range(num):
            with open(fileForSaveData + str(x) + ".txt", "rb") as part:
                target.write(part.read())


# Main routine.
# Measured: 1000 room ids with 20 workers took about 430 s / 380 s.
def task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes):
    """Run the scrape and record a header, the rows and the elapsed time.

    Each of the *loopTimes* rounds starts *multiProcessNum* processes, and
    each process handles *roomIdInterval* consecutive room ids.
    Example values: roomIdStart=2000, roomIdInterval=10, multiProcessNum=20,
    loopTimes=10.
    """
    import time

    rangeEnd = roomIdStart + roomIdInterval * multiProcessNum * loopTimes
    title = "\n================= new record ==================\n"
    title += "time:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n"
    title += "roomIdStart:" + str(roomIdStart) + "\nroomIdInterval:" + str(roomIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    title += "range[" + str(roomIdStart) + ":" + str(rangeEnd) + "]" + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    start = time.time()
    for loop in range(loopTimes):
        loopStart = roomIdStart + loop * roomIdInterval * multiProcessNum
        proce = [
            Process(target=catch_data_task,
                    args=(x, loopStart + (x * roomIdInterval), roomIdInterval))
            for x in range(multiProcessNum)
        ]
        for p in proce:
            p.start()
        for p in proce:
            p.join()
        target_file_add_content(multiProcessNum)
    end = time.time()

    timeStr = '==========================\n总共时%.2f秒' % (end - start)
    print(timeStr)
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))


def main():
    """Validate the four command-line arguments and start the scrape.

    Prints a usage message and returns without scraping when an argument is
    missing, is not an integer, or is out of range.
    """
    # The argument names were stripped from the published usage string
    # (they were written in angle brackets); restored here.
    usage = ("useage: python " + sys.argv[0]
             + " <roomIdStart> <roomIdInterval> <multiProcessNum> <loopTimes>")
    if len(sys.argv) <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(len(sys.argv)))
        return

    limits = (
        ("roomIdStart", 10000000),
        ("roomIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for (label, upper), raw in zip(limits, sys.argv[1:5]):
        try:
            value = int(raw)
        except ValueError:
            print(usage + "\nparam " + label + " is not an integer, you input ==>" + raw)
            return
        if value < 0 or value > upper:
            print(usage + "\nparam " + label + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    roomIdStart, roomIdInterval, multiProcessNum, loopTimes = values
    print("your input ==>\nroomIdStart:%d roomIdInterval:%d multiProcessNum:%d loopTimes:%d\n"%(roomIdStart, roomIdInterval, multiProcessNum, loopTimes))
    task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes)


#=============================
if __name__ == '__main__':
    main()


##################################
#          测       试       :b站用户粉丝数量信息抓取(多次使用可能被短期禁止访问)    #
##################################
from urllib.request import urlopen
from multiprocessing import freeze_support,Lock,Process,Value
import sys

# Shared integer counter handed to every worker process; each worker adds 1 per
# user it has processed, and task_start resets it to 0 before each round.
g_var_cnt=Value('i',0)
# Lock the workers hold while incrementing g_var_cnt.
g_var_lock=Lock()


# Create the file, or empty it if it already exists.
def create_file(name):
    """Create *name* as an empty file, truncating any existing content."""
    with open(name, "w"):
        pass
    
# Append text to a file.
def write_to_file(name, bufIn):
    """Append the string *bufIn* to the file *name*, creating it if needed.

    The text is written as UTF-8 so it matches the UTF-8 bytes that the rest
    of this script writes, independent of the platform's default encoding.
    """
    # The context manager closes the handle even if the write raises.
    with open(name, "a", encoding="utf-8") as f:
        f.write(bufIn)

# Reference (reading data a page does not show directly): https://blog.csdn.net/qq_38270802/article/details/90204609
# Profile page:   "https://space.bilibili.com/10558188"
# Account info:   "https://api.bilibili.com/x/space/acc/info?mid=10558188&jsonp=jsonp"
# Relation stats: "https://api.bilibili.com/x/relation/stat?vmid=10558188&jsonp=jsonp"
def get_user_info(totle, fileName, rangeStart, rangeEnd, cnt, lock):
    """Fetch name, following and follower counts for a range of user ids.

    One tab-separated row ``id, name, following, follower`` is written to
    *fileName* for every id, and a progress line is printed after each one.

    Args:
        totle: Number of users handled by all workers in the current round;
            used only to compute the printed percentage.
        fileName: Output file; it is overwritten and written as UTF-8.
        rangeStart: First user id (inclusive).
        rangeEnd: Last user id (exclusive).
        cnt: Shared counter (has a ``.value``), incremented once per user.
        lock: Lock held while *cnt* is updated.
    """
    urlForInfoStart = "https://api.bilibili.com/x/space/acc/info?mid="
    urlForInfoEnd = "&jsonp=jsonp"
    nameMarkStart = "\"name\":\""
    nameMarkEnd = "\",\"sex\":\""

    urlForStatStart = "https://api.bilibili.com/x/relation/stat?vmid="
    urlForStatEnd = "&jsonp=jsonp"
    followingMarkStart = "\"following\":"
    followingMarkEnd = ",\"whisper\":"
    followMarkStart = "\"follower\":"
    followMarkEnd = "}}"

    # UTF-8 keeps nicknames writable on any platform and matches the UTF-8
    # header that task_start writes into the merged file.
    with open(fileName, "w", encoding="utf-8") as f:
        for x in range(rangeStart, rangeEnd):
            # Reset per user so a failed lookup is reported with the defaults
            # rather than with the previous user's values.
            name = "[null]"
            following = 0
            follow = 0

            # Nickname. (The original also fired an extra request for a fixed
            # mid on every iteration and discarded the result; it is dropped.)
            with urlopen(urlForInfoStart + str(x) + urlForInfoEnd) as response:
                rst = response.read().decode("utf-8")
            pos1 = rst.find(nameMarkStart)
            pos2 = rst.find(nameMarkEnd)
            if pos1 != -1 and pos2 != -1:
                name = rst[pos1 + len(nameMarkStart):pos2]

            # Following and follower counts.
            with urlopen(urlForStatStart + str(x) + urlForStatEnd) as response:
                rst = response.read().decode("utf-8")
            pos1 = rst.find(followingMarkStart)
            pos2 = rst.find(followingMarkEnd)
            if pos1 != -1 and pos2 != -1:
                following = int(rst[pos1 + len(followingMarkStart):pos2])
            pos1 = rst.find(followMarkStart)
            pos2 = rst.find(followMarkEnd)
            if pos1 != -1 and pos2 != -1:
                follow = int(rst[pos1 + len(followMarkStart):pos2])

            # Read the counter while still holding the lock so the printed
            # percentage reflects this worker's own increment.
            with lock:
                cnt.value += 1
                done = cnt.value

            record = "%d\t%s\t%d\t%d" % (x, name, following, follow)
            # "\r" redraws the progress line in place; pad to 40 columns so a
            # shorter line fully overwrites a longer previous one.
            outStr = "\r%d%%\t" % (done * 100 / totle) + record
            print(outStr.ljust(40), end="")
            f.write(record + "\n")

    print("")

def get_user_info_task(dataFileName, loopTimes, loop, index, multiProcessNum, start, interval, cnt, lock):
    """Worker entry point: announce the round, then scrape this worker's id range.

    The worker writes to ``<dataFileName><index>.txt`` and covers the ids
    ``start`` .. ``start + interval - 1``.
    """
    banner = "=======================================>task[%d/%d]%d%%"
    print(banner % (loop + 1, loopTimes, loop * 100 / loopTimes))
    # Users handled by all workers of this round; used for the progress figure.
    usersThisRound = multiProcessNum * interval
    outputPath = dataFileName + str(index) + ".txt"
    get_user_info(usersThisRound, outputPath, start, start + interval, cnt, lock)
    

def target_file_add_content(targetFileName, dataFileName, num):
    """Append the per-worker result files to the target file.

    Args:
        targetFileName: File that receives the data (opened for append).
        dataFileName: Prefix of the worker files; worker ``i`` wrote
            ``<dataFileName><i>.txt``.
        num: Number of worker files to append, in index order.
    """
    # Context managers close both handles even if a read or write fails.
    with open(targetFileName, "ab") as target:
        for x in range(num):
            with open(dataFileName + str(x) + ".txt", "rb") as part:
                target.write(part.read())

def task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes):
    """Run the Bilibili scrape and record a header, the rows and a summary.

    Each of the *loopTimes* rounds starts *multiProcessNum* processes; every
    process handles *userIdInterval* consecutive user ids. After a round, the
    per-worker files are appended to ``data/target.txt``.

    Args:
        userIdStart: First user id to query.
        userIdInterval: Number of ids handled by one process.
        multiProcessNum: Number of processes per round.
        loopTimes: Number of rounds.
    """
    import os
    import time

    start = time.time()
    dataFileName = "data/biliData"
    targetFileName = "data/target.txt"
    rangeEnd = userIdStart + userIdInterval * multiProcessNum * loopTimes

    # Workers and the merge step all write under data/; create it up front
    # instead of failing when the directory is missing.
    os.makedirs(os.path.dirname(targetFileName), exist_ok=True)

    title = "\n================= new record ==================\n"
    title += "time:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n"
    title += "userIdStart:" + str(userIdStart) + "\nuserIdInterval:" + str(userIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    title += "range[" + str(userIdStart) + ":" + str(rangeEnd) + "]" + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    searched = 0
    for loop in range(loopTimes):
        # The shared counter drives the per-round progress percentage, so it
        # restarts at 0 for every round.
        g_var_cnt.value = 0
        loopStart = userIdStart + loop * userIdInterval * multiProcessNum
        proce = [
            Process(target=get_user_info_task,
                    args=(dataFileName, loopTimes, loop, i, multiProcessNum,
                          loopStart + (i * userIdInterval), userIdInterval,
                          g_var_cnt, g_var_lock,))
            for i in range(multiProcessNum)
        ]
        for x in proce:
            x.start()
        for x in proce:
            x.join()
        # Accumulate before the next reset; otherwise the summary would only
        # report the last round.
        searched += g_var_cnt.value
        target_file_add_content(targetFileName, dataFileName, multiProcessNum)
    end = time.time()

    timeStr = '\n==========================\n总共时%.2f秒, 搜索%d位用户[%d:%d], 实际搜索%d(访问可能被禁止)' % (end - start, multiProcessNum * loopTimes * userIdInterval, userIdStart, rangeEnd, searched)
    print(timeStr)
    # Write the summary as UTF-8 bytes, like the header, so the file does not
    # end up with mixed encodings on platforms whose default is not UTF-8.
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))


def main():
    """Validate the four command-line arguments and start the scrape.

    Expected arguments, in order: userIdStart (0..100000000), userIdInterval
    (0..10000), multiProcessNum (0..20), loopTimes (0..20). A usage message is
    printed and nothing is scraped when an argument is missing, is not an
    integer, or is out of range.
    """
    # The argument names were stripped from the published usage string
    # (they were written in angle brackets); restored here.
    usage = ("useage: python " + sys.argv[0]
             + " <userIdStart> <userIdInterval> <multiProcessNum> <loopTimes>")
    if len(sys.argv) <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(len(sys.argv)))
        return

    limits = (
        ("userIdStart", 100000000),
        ("userIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for (label, upper), raw in zip(limits, sys.argv[1:5]):
        try:
            value = int(raw)
        except ValueError:
            # Report bad input like any other argument error instead of
            # crashing with a traceback.
            print(usage + "\nparam " + label + " is not an integer, you input ==>" + raw)
            return
        if value < 0 or value > upper:
            print(usage + "\nparam " + label + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    userIdStart, userIdInterval, multiProcessNum, loopTimes = values
    print("============================================================")
    print("                       task  start                          ")
    print("userIdStart:%d userIdInterval:%d multiProcessNum:%d loopTimes:%d"%(userIdStart, userIdInterval, multiProcessNum, loopTimes))
    print("============================================================")
    task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes)


# Run the command-line entry point only when this file is executed as a script
# (not when it is imported).
if __name__ == '__main__':
    main()

##################################
# 每日一抓:  虎牙粉丝排行榜数据 #
# 更新时间:2022年10月3日10:19:39#
##################################
'''
注意:
(1)网页数据里的关键信息(主播名、订阅量、视频页面id等)所在位置可能会更改,需要
根据实际情况修改,例如变量anchorMark、anchorEndMark、subscribeMark、subscribeEndMark、
videoMark、videoEndMark
(2)主播的直播间有房间号roomid,视频页有视频页的序号videoid,有时候
直播间页面出现订阅量为0的情况,这时候可以借助于查看视频页面的订阅量信息,
并给出了警告 [alarm:subscribe=0]  
(3)变量 
python a.py [房间起始id] [一组房间的个数] [线程数] [循环次数,也可称为房间组数]
useage: python a.py <roomIdStart> <roomIdInterval> <multiProcessNum> <loopTimes>
总共搜索数 roomIdInterval * multiProcessNum * loopTimes
(4)例子
H:\test>python a.py 520520 2 2 2
your input ==>
roomIdStart:520520 roomIdInterval:2 multiProcessNum:2 loopTimes:2

520520  骚男    21091778        900821317
520522  9oo58丶阿囍     56711   1837296775
roomId:520521==>【未找到】
520523  伦桑    576206  570241969
520524  Yz丶混世【524】 336845  1756485620
520526  唯爱瑾宝        27501   1863891482
roomId:520525==>【未找到】
roomId:520527==>【未找到】
==========================
总共时1.86秒

'''

from urllib.request import urlopen
import sys

# Create the file, or empty it if it already exists.
def create_file(name):
    """Create *name* as an empty file, truncating any existing content."""
    with open(name, "w"):
        pass

# Append already-encoded data to a file.
def save_data_to_file(fileName, bufUtf8):
    """Append the bytes *bufUtf8* to *fileName*, creating the file if needed.

    Args:
        fileName: Path of the file to append to.
        bufUtf8: Bytes to write (callers pass UTF-8 encoded text).
    """
    # The context manager closes the handle even when the write fails; the
    # original never closed it.
    with open(fileName, "ab") as f:
        f.write(bufUtf8)



# Scrape the subscriber count from a Huya video page.
# Example page: https://v.huya.com/u/1199553057095

def get_subscribe_num_by_idstr(idstr):
    """Return the subscriber count shown on a Huya video page, as a string.

    Only lines 180-280 of the response are inspected, to save time.

    Args:
        idstr: Video-page id; it is appended to ``https://v.huya.com/u/``.

    Returns:
        The text following the subscribe marker on the last matching line
        inside the scanned window, or ``"0"`` when no line matches.
    """
    videoBaseUrl = 'https://v.huya.com/u/'
    subscribeMark = "                        订阅:"
    firstLine, lastLine = 180, 280
    result = "0"
    # Close the HTTP response deterministically; the original leaked it.
    with urlopen(videoBaseUrl + idstr) as response:
        for cnt, line in enumerate(response, start=1):
            if cnt < firstLine:
                continue
            if cnt > lastLine:
                # Nothing past the window is used, so stop downloading.
                break
            line_str = line.decode(encoding="utf-8")
            pos = line_str.find(subscribeMark)
            if pos != -1:
                # Slice from where the marker actually is and strip whatever
                # line ending is present, instead of assuming the marker sits
                # at column 0 and the line ends in exactly "\r\n".
                result = line_str[pos + len(subscribeMark):].rstrip("\r\n")
    return result

# Scrape one Huya live-room page.
# Example page: https://www.huya.com/298039

# NOTE(review): every HTML marker literal used by web_content_pro was eaten by
# the page renderer when this listing was published, together with the
# anchorEndMark / videoEndMark assignments, the variable initialisation and
# the loop header. The placeholder below never matches, so every room is
# reported as not found until the real markers are restored from the live
# page source.
_LOST_MARKER = "<<HTML marker lost in publication - restore from page source>>"


def _text_between(line_str, startMark, endMark):
    """Return the text between *startMark* and the next *endMark*, else None."""
    pos1 = line_str.find(startMark)
    if pos1 == -1:
        return None
    tail = line_str[pos1:]
    pos2 = tail.find(endMark)
    if pos2 == -1:
        return None
    return tail[len(startMark):pos2]


def web_content_pro(url, roomId, fileName):
    """Scrape one live-room page and append a result row to *fileName*.

    The first 12 lines of the page are scanned for the streamer name, the
    subscriber count and the video-page id; each value is the text between
    its start marker and end marker on one line. When a name is found, the
    tab-separated row ``roomId, anchor, subscribe, video`` is printed and
    appended to *fileName* as UTF-8; otherwise a "not found" line is printed.
    If the room page reports ``'0'`` subscribers, the count is re-read from
    the video page and the row is tagged ``[alarm:subscribe=0]``.

    Args:
        url: Full URL of the room page.
        roomId: Room id, used only for output.
        fileName: File that receives the result row.
    """
    markers = (
        ("anchor", _LOST_MARKER, _LOST_MARKER),
        ("subscribe", _LOST_MARKER, _LOST_MARKER),
        ("video", _LOST_MARKER, _LOST_MARKER),
    )
    found = {}
    with urlopen(url) as response:
        for cnt, line in enumerate(response, start=1):
            # Earlier revision scanned lines 100-200; this one uses lines 1-12.
            if cnt > 12:
                break
            line_str = line.decode(encoding="utf-8")
            for key, startMark, endMark in markers:
                if key not in found:
                    text = _text_between(line_str, startMark, endMark)
                    if text is not None:
                        found[key] = text
            if len(found) == len(markers):
                break

    # NOTE(review): these defaults were lost from this revision's text and are
    # taken from the earlier revision of the same function - confirm.
    anchor = found.get("anchor", "")
    subscribe = found.get("subscribe", "0")
    video = found.get("video", "")

    if len(anchor) > 0:
        endMark = ""
        if subscribe == '0':
            # The room page sometimes shows 0; fall back to the video page.
            subscribe = get_subscribe_num_by_idstr(video)
            endMark = "\t[alarm:subscribe=0]"
        resultOut = str(roomId) + "\t" + anchor + "\t" + subscribe + "\t" + video + endMark
        print(resultOut)
        save_data_to_file(fileName, (resultOut + "\n").encode(encoding="utf-8"))
    else:
        print("roomId:" + str(roomId) + "==>【未找到】")


# Scrape a consecutive run of rooms.
def catch_data(fileName, url, start, num):
    """Scrape rooms ``start`` .. ``start + num - 1`` and append rows to *fileName*."""
    for x in range(num):
        web_content_pro(url + str(start + x), start + x, fileName)


# Concurrent tasks
from multiprocessing import Process
from os import getpid

urlStd = "https://www.huya.com/"
# Prefix of the per-worker result files: data/fansData<index>.txt
fileForSaveData = "data/fansData"
# File that accumulates the results of every run.
targetFileName = "data/target.txt"


def catch_data_task(index, start, num):
    """Worker entry point: scrape *num* rooms into this worker's own file."""
    newName = fileForSaveData + str(index) + ".txt"
    create_file(newName)
    catch_data(newName, urlStd, start, num)


# Append the per-worker files to the target file.
def target_file_add_content(num):
    """Append the result files of workers ``0 .. num - 1`` to the target file."""
    with open(targetFileName, "ab") as target:
        for x in range(num):
            with open(fileForSaveData + str(x) + ".txt", "rb") as part:
                target.write(part.read())


# Main routine.
# Measured: 1000 room ids with 20 workers took about 430 s / 380 s.
def task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes):
    """Run the scrape and record a header, the rows and the elapsed time.

    Each of the *loopTimes* rounds starts *multiProcessNum* processes, and
    each process handles *roomIdInterval* consecutive room ids.
    Example values: roomIdStart=2000, roomIdInterval=10, multiProcessNum=20,
    loopTimes=10.
    """
    import time

    rangeEnd = roomIdStart + roomIdInterval * multiProcessNum * loopTimes
    title = "\n================= new record ==================\n"
    title += "time:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n"
    title += "roomIdStart:" + str(roomIdStart) + "\nroomIdInterval:" + str(roomIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    title += "range[" + str(roomIdStart) + ":" + str(rangeEnd) + "]" + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    start = time.time()
    for loop in range(loopTimes):
        loopStart = roomIdStart + loop * roomIdInterval * multiProcessNum
        proce = [
            Process(target=catch_data_task,
                    args=(x, loopStart + (x * roomIdInterval), roomIdInterval))
            for x in range(multiProcessNum)
        ]
        for p in proce:
            p.start()
        for p in proce:
            p.join()
        target_file_add_content(multiProcessNum)
    end = time.time()

    timeStr = '==========================\n总共时%.2f秒' % (end - start)
    print(timeStr)
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))


def main():
    """Validate the four command-line arguments and start the scrape.

    Prints a usage message and returns without scraping when an argument is
    missing, is not an integer, or is out of range.
    """
    # The argument names were stripped from the published usage string
    # (they were written in angle brackets); restored here.
    usage = ("useage: python " + sys.argv[0]
             + " <roomIdStart> <roomIdInterval> <multiProcessNum> <loopTimes>")
    if len(sys.argv) <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(len(sys.argv)))
        return

    limits = (
        ("roomIdStart", 10000000),
        ("roomIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for (label, upper), raw in zip(limits, sys.argv[1:5]):
        try:
            value = int(raw)
        except ValueError:
            print(usage + "\nparam " + label + " is not an integer, you input ==>" + raw)
            return
        if value < 0 or value > upper:
            print(usage + "\nparam " + label + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    roomIdStart, roomIdInterval, multiProcessNum, loopTimes = values
    print("your input ==>\nroomIdStart:%d roomIdInterval:%d multiProcessNum:%d loopTimes:%d\n"%(roomIdStart, roomIdInterval, multiProcessNum, loopTimes))
    task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes)


#=============================
if __name__ == '__main__':
    main()

你可能感兴趣的:(虎牙、B站网页信息python抓取试试)