直接上爬虫代码
如果你对python感兴趣,我这有个学习Python基地,里面有很多学习资料,感兴趣的+Q群:688244617
import requests
import re
# Template URL for the followers list of one Weibo account (uid 1005051678105910);
# the single %d placeholder is filled with the page number by get_data().
tmpt_url = 'https://weibo.com/p/1005051678105910/follow?page=%d#Pl_Official_HisRelation__59'
def get_data(tmpt_url, pages=5):
    """Scrape the first *pages* pages of a Weibo followers list.

    Parameters
    ----------
    tmpt_url : str
        Page-URL template containing a single ``%d`` placeholder for the
        page number (see the module-level ``tmpt_url``).
    pages : int, optional
        Number of follower-list pages to fetch. Defaults to 5, matching
        the originally hard-coded ``range(1, 6)``.

    Returns
    -------
    dict
        Keys ``'user_id'``, ``'user_name'``, ``'user_follow'``,
        ``'user_fans'``, ``'user_address'``; each value is a list of the
        strings extracted from all fetched pages. The lists are also
        printed, preserving the original behavior.

    Raises
    ------
    requests.HTTPError
        If any page responds with an HTTP error status.
    """
    urllist = [tmpt_url % i for i in range(1, pages + 1)]
    user_id = []       # follower IDs
    user_name = []     # follower display names
    user_follow = []   # how many accounts each follower follows
    user_fans = []     # each follower's own follower count
    user_address = []  # each follower's location
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        # NOTE(review): copy the Cookie value from your own logged-in browser
        # session; weibo.com requires it and it is deliberately not published.
        'Cookie': '请在自己的浏览器中查看,因涉及个人隐私不公开',
        'Host': 'weibo.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    }
    # The patterns target weibo's JSON-escaped HTML, where closing tags appear
    # literally as e.g. ``<\/a>`` — hence the escaped backslash (``\\``) in
    # each regex. Compile once, outside the download loop.
    id_re = re.compile(r'\S+<\\/a>')
    name_re = re.compile(r'(\S+)<\\/a>')
    follow_re = re.compile(r'关注 (\d+)<\\/a>')
    fans_re = re.compile(r'粉丝(\d+)<\\/a>')
    address_re = re.compile(
        r'地址<\\/em>(\S+\s?\S+?)<\\/span>\\r\\n\\t\\t\\t\\t\\t<\\/div>')
    for url in urllist:
        # timeout so a stalled connection cannot hang the scraper forever;
        # raise_for_status surfaces HTTP errors instead of regex-parsing an
        # error page as if it were a follower list.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        html = resp.text
        user_id.extend(id_re.findall(html))
        user_name.extend(name_re.findall(html))
        user_follow.extend(follow_re.findall(html))
        user_fans.extend(fans_re.findall(html))
        user_address.extend(address_re.findall(html))
    print('user_id', user_id)
    print('user_name', user_name)
    print('user_follow', user_follow)
    print('user_fans', user_fans)
    print('user_address', user_address)
    # Return the scraped columns so callers can use them programmatically
    # (the original only printed them and implicitly returned None).
    return {
        'user_id': user_id,
        'user_name': user_name,
        'user_follow': user_follow,
        'user_fans': user_fans,
        'user_address': user_address,
    }
这个 URL 指向孙俪微博账号的粉丝列表页面(uid 1005051678105910)
下面是她粉丝列表前5页爬到的信息,包括:粉丝ID,粉丝名称,粉丝的关注,粉丝的粉丝量,粉丝的地址