1 Comic Crawler
I thought the comic (国王排名, Ranking of Kings) was pretty good at the time, so I wrote a crawler to download it, planning to find time the next day to finish reading it.
Sharing the code:
import requests
from bs4 import BeautifulSoup
import re
import os
from contextlib import closing
from tqdm import tqdm
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36"
,"Referer": "https://manhua.dmzj.com/guowangpaiming/116550.shtml"
}
def get_imgurl(url0, url1):  # url0 is the chapter's index page, url1 is the URL of the chapter's first image
    rsp = requests.get(url=url0, headers=header)
    rsp.encoding = 'utf-8'
    bs = BeautifulSoup(rsp.text, 'lxml')
    script_info = bs.script
    # Use a regex to pull the image page numbers out of the script tag
    nums = re.findall(r'\|(\d{4})', str(script_info))
    nums.sort(reverse=True)
    urls = []
    for num in nums:
        # insert(0, ...) on the descending list puts the pages back in ascending order
        urls.insert(0, url1[:-8] + num + ".jpg")
    # Return the list of image URLs for one chapter
    return urls
def get_chapers(url0):  # url0 is the comic's index page
    change_ref(url0)
    rsp = requests.get(url=url0, headers=header)
    rsp.encoding = 'utf-8'
    bshtml = BeautifulSoup(rsp.text, 'lxml')
    chaperlist = bshtml.find('div', class_="cartoon_online_border")
    chapers = chaperlist.find_all('a')
    chaperurls = []
    chapernames = []
    for chaper in chapers:
        chaperurls.insert(0, "https://manhua.dmzj.com/" + chaper.get('href'))
        chapernames.insert(0, "国王排名" + chaper.text)
    # Return every chapter's index URL and its name (which carries the chapter number)
    return chapernames, chaperurls
def get_firstimgurl(chapernum):  # chapernum is the chapter (volume) number
    url = "https://images.dmzj.com/g/%E5%9B%BD%E7%8E%8B%E6%8E%92%E5%90%8D/%E7%AC%AC" + chapernum + "%E5%8D%B7/0000.jpg"
    # Return the URL of the chapter's first image
    return url

# This extra function exists because I found that the first chapter's URL is encoded by "话" (episode) rather than "卷" (volume), so chapter 1 uses it instead
def get_firstimgurl1():
    return "https://images.dmzj.com/g/%E5%9B%BD%E7%8E%8B%E6%8E%92%E5%90%8D/%E7%AC%AC01%E8%AF%9D/0000.jpg"
def down_img(url, num, dirname):  # url is the image URL, num is the image index, dirname is the folder name
    with closing(requests.get(url, headers=header, stream=True)) as response:
        chunk_size = 1024
        if response.status_code == 200:
            with open(dirname + '/' + num + '.jpg', "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
        else:
            print('Unexpected response for ' + url)
def change_ref(chaperurl):
    global header
    header['Referer'] = chaperurl
# url0 is the comic's index page
url0 = "https://manhua.dmzj.com/guowangpaiming"
# chapernames holds each chapter's name, chaperurls each chapter's index page
chapernames, chaperurls = get_chapers(url0)
for i in range(len(chaperurls)):
    chapernum = chapernames[i][-3:-1]
    chapername = chapernames[i]
    chaperurl = chaperurls[i]
    # Chapter 1's first image uses the "话" encoding, all others the "卷" encoding
    firstimgurl = get_firstimgurl(chapernum) if i else get_firstimgurl1()
    # (The original post is cut off here; the rest of the loop is a straightforward
    # completion using the functions above: fetch the chapter's image URLs, create
    # its folder, and download each page.)
    change_ref(chaperurl)
    imgurls = get_imgurl(chaperurl, firstimgurl)
    if not os.path.exists(chapername):
        os.makedirs(chapername)
    for j, imgurl in enumerate(tqdm(imgurls)):
        down_img(imgurl, str(j), chapername)
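A side note on the long percent-encoded image URLs in get_firstimgurl: they are just the UTF-8 URL-quoting of the comic's Chinese title and the chapter segment. A tiny standalone check (standard library only, separate from the crawler above) if you want to see the correspondence:

from urllib.parse import quote, unquote

# The encoded segment from get_firstimgurl() decodes back to the comic's title
print(unquote("%E5%9B%BD%E7%8E%8B%E6%8E%92%E5%90%8D"))  # 国王排名
# Quoting the title and a chapter segment reproduces the path pieces used above
print(quote("国王排名"))  # %E5%9B%BD%E7%8E%8B%E6%8E%92%E5%90%8D
print(quote("第01话"))    # %E7%AC%AC01%E8%AF%9D  (episode-style, chapter 1 only)
print(quote("第02卷"))    # %E7%AC%AC02%E5%8D%B7  (volume-style, all other chapters)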
2 Bilibili Video Danmaku Crawler
If all you want is the danmaku (bullet-comment) file, the earlier code wastes far too much time.
So I made a small improvement using Bilibili's official API. (I didn't find any other convenient way to download danmaku besides the official API.)
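The idea in miniature: for each video, resolve the BV id to a cid via the official pagelist endpoint, then pull the danmaku XML for that cid. A minimal, hedged sketch of that chain for a single video (the bvid value and the trimmed-down headers here are placeholders for illustration only):

import requests
from xml.dom.minidom import parseString

headers = {"User-Agent": "Mozilla/5.0"}
bvid = "BVxxxxxxxxxx"  # placeholder: substitute any real video's BV id

# 1) pagelist: resolve the BV id to the video's cid
rsp = requests.get('https://api.bilibili.com/x/player/pagelist?bvid=' + bvid + '&jsonp=jsonp', headers=headers)
cid = rsp.json()['data'][0]['cid']

# 2) dm/list.so: fetch the danmaku XML for that cid and read out the <d> nodes
rsp = requests.get('https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid), headers=headers)
rsp.encoding = 'utf-8'
danmus = [d.childNodes[0].data for d in
          parseString(rsp.text).documentElement.getElementsByTagName('d')]
print(len(danmus), "danmaku fetched")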
The full code is as follows:
import requests
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
from xml.dom.minidom import parseString
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36"}
def get_resp_text(url):
    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'utf-8'
    return rsp.text
def get_upname(mid):
    # Fetch the uploader's space page and strip the fixed suffix from <title> to get the name
    rsp = requests.get('https://space.bilibili.com/' + mid, headers=headers)
    rsp.encoding = 'utf-8'
    html = rsp.text
    bss = BeautifulSoup(html, 'lxml')
    return bss.find('title').text[:-len('的个人空间_哔哩哔哩_Bilibili')]
def get_bvid(mid):
    # Page through the uploader's video list (30 per page) until an empty page comes back
    i = 1
    bvid = []
    while i != 0:
        url0 = ('https://api.bilibili.com/x/space/arc/search?mid=' + str(mid)
                + '&ps=30&tid=0&pn=' + str(i) + '&keyword=&order=pubdate&jsonp=jsonp')
        i += 1
        rsp = requests.get(url0, headers=headers)
        rsp.encoding = 'utf-8'
        html = rsp.text
        rspdict = json.loads(html.replace('\n', ''))
        datadict = rspdict['data']
        listdict = datadict['list']
        vlist = listdict['vlist']
        if len(vlist) == 0:
            i = 0  # empty page: stop paging
        else:
            for _ in range(len(vlist)):
                bvid.insert(0, vlist[_]['bvid'])
    print("Finished fetching BV ids")
    return bvid
def get_cid_url(bvid):
    cid_url = []
    for bid in bvid:
        cid_url.insert(0, 'https://api.bilibili.com/x/player/pagelist?bvid=' + str(bid) + '&jsonp=jsonp')
    return cid_url
def get_cids(cid_urls):
    # data[0]['cid'] in the pagelist response is the id the danmaku API expects
    cids = []
    for cid_url in cid_urls:
        jsontext = get_resp_text(cid_url)
        jsonstr = json.loads(jsontext)
        jsrdata = jsonstr['data']
        jsrdict = jsrdata[0]
        cids.insert(0, jsrdict['cid'])
    return cids
def get_xml_url(cids):
    xml_urls = []
    for cid in cids:
        xml_urls.insert(0, 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid))
    return xml_urls
def get_xmls(xml_urls):
    xmls = []
    for xml_url in xml_urls:
        xmls.insert(0, get_resp_text(xml_url))
    return xmls
def get_danmus(xmls):
    # Each <d> element in the danmaku XML holds one comment's text
    danmus = []
    for xml in xmls:
        tanmus = parseString(xml).documentElement.getElementsByTagName('d')
        for tanmu in tanmus:
            tanmu = tanmu.childNodes[0].data
            danmus.insert(0, tanmu)
    print("Finished fetching danmaku\nSaving to a local file")
    return danmus
def save_danmus(upname, danmus):
    with open(upname + ".txt", 'w', encoding='utf-8') as f:
        for danmu in tqdm(danmus):
            f.write(danmu + "\n")
    print("A total of " + str(len(danmus)) + " danmaku have been saved to " + upname + ".txt")
if __name__ == '__main__':
    uid = input("Enter the uploader's uid: ")
    upname = get_upname(uid)
    print("The uploader you are looking for is " + upname + ", right?\nJust a moment~~")
    bvid = get_bvid(uid)
    cid_urls = get_cid_url(bvid)
    cids = get_cids(cid_urls)
    xml_urls = get_xml_url(cids)
    xmls = get_xmls(xml_urls)
    danmus = get_danmus(xmls)
    save_danmus(upname, danmus)