Learning to Scrape Comics from Example Code (Part 1)

# Learning to scrape comics

  • [ 1] Download the comic successfully with the original code
  • [ 2] Work through the difficulties

Original code

from urllib.parse import urlencode

import requests
import re
import json
import execjs
from bs4 import BeautifulSoup
from pymongo import MongoClient
import os

conn = MongoClient('localhost',27017)
db = conn['comics']

def get_comic(comic_name):
    param = {'s': comic_name}

    url = 'https://sacg.dmzj.com/comicsum/search.php?' + urlencode(param)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4315.5 Safari/537.36',
        'Referer': 'https://manhua.dmzj.com/tags/search.shtml?s=%E5%9B%9B%E6%9C%88%E6%98%AF%E4%BD%A0%E7%9A%84%E8%B0%8E%E8%A8%80',
        'Host': 'sacg.dmzj.com',
    }
    # Extract the JS code that holds the image URL info
    response = requests.get(url, headers=headers)
    json_str = re.search(r"\[(.*?)\];", response.text).group(1)
    json_str = re.search(r"({.*?})", json_str).group(1)
    comic_url = 'https:' + json.loads(json_str)['comic_url_raw']
    return comic_url

def get_chapter(comic_url):
    response = requests.get(comic_url)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    srcs = soup.find('div',class_="cartoon_online_border").find_all('a')
    chapters_url = []
    for a in srcs:
        chapters_url.append(a.get('href'))
    return chapters_url

# Run the JS code that holds the image URLs
def get_chapter_imgs_url(chapter_url):
    response = requests.get(chapter_url)
    js_str = re.search(r'eval\((.*?)\)\n', response.text).group(1)
    js_str = js_str.replace('function(p,a,c,k,e,d)', 'function fun(p, a, c, k, e, d)')

    fun = """
             function run(){
                    var result = %s;
                    return result;
                }
        """ % js_str
    pages = execjs.compile(fun).call('run')
    data = pages.split('=')[2][1:-2]
    url_list = json.JSONDecoder().decode(data)
    for i in range(0, len(url_list)):
        url_list[i] = 'https://images.dmzj.com/' + url_list[i]
    # url_list = dict(zip(range(1,len(url_list)+1),url_list))
    return url_list



def save_to_mongo(comic_name, record):
    if db[comic_name].insert(record):
        print('Saved to MongoDB')
        return True
    return False

def save_images(content, comic_name, chap, imgNum):
    root_path = 'D:\\comic\\' + comic_name
    if not os.path.exists(root_path):
        os.makedirs(root_path)
    chapter_path = root_path + '\\第' + str(chap) + '话\\'
    if not os.path.exists(chapter_path):
        os.mkdir(chapter_path)
    path = chapter_path + str(imgNum) + '.jpg'
    if not os.path.exists(path):
        with open(path, 'wb') as f:
            f.write(content)
            print(path + ' downloaded')

def download_images(comic_name):
    table = db[comic_name]
    # Work around the image hotlink protection (basic version): adding a Referer header is enough
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'https://www.baidu.com/link?url=g3QbzSNbZietJQ_Rf4wcjn8RDipbM5wWtRYwvndTU64RtUj0yIVYBz75dHfoLnu9&wd=&eqid=ca0acac40002851a000000065bb84b15'
    }
    for src in table.find({}, {"_id": 0}):
        i = 0
        for url in src['img']:
            i+=1
            try:
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    save_images(response.content, comic_name, src['chapter'], i)
                elif response.status_code == 403:
                    print('403 Forbidden ')
            except Exception as e:
                print('Error requesting image', url)
                print('error ', e)


# First run configuration: read the image URLs already stored in MongoDB and download the images
def main():
    comic_name = '四月是你的谎言'
    # comic_url = get_comic(comic_name)
    # chapters_url = get_chapter(comic_url)
    record = {}
    # for i in range(0, len(chapters_url)):
    #     chapters_url[i] = 'https://manhua.dmzj.com' + chapters_url[i]
    #     record['_id'] = i + 1
    #     record['chapter'] = (i + 1)
    #     record['img'] = (get_chapter_imgs_url(chapters_url[i]))
    #     save_to_mongo(comic_name, record)
    download_images(comic_name)


if __name__ == '__main__':
    main()

# Second run configuration: scrape the chapter and image URLs and save them to MongoDB
def main():
    comic_name = '四月是你的谎言'
    comic_url = get_comic(comic_name)
    chapters_url = get_chapter(comic_url)
    record = {}
    for i in range(0, len(chapters_url)):
        chapters_url[i] = 'https://manhua.dmzj.com' + chapters_url[i]
        record['_id'] = i + 1
        record['chapter'] = (i + 1)
        record['img'] = (get_chapter_imgs_url(chapters_url[i]))
        save_to_mongo(comic_name, record)
    # download_images(comic_name)


if __name__ == '__main__':
    main()

Difficulty 1 ----- pymongo

  • conn = MongoClient('localhost', 27017) ---- connect without a username or password ----------> that is, how to use pymongo. I also tried learning pymysql, but storing images that way is a bit involved; learning pymongo seems more direct and efficient for this.

pymongo

Code involved:

from pymongo import MongoClient
conn = MongoClient('localhost', 27017)  # connect to the database
db = conn['comics']  # select the database (in MongoDB both databases and collections are created lazily; see the docs for details)
def save_to_mongo(comic_name, record):
    if db[comic_name].insert(record):
        print('Saved to MongoDB')
        return True
    return False

I roughly understand what this does; I won't dig deeper for now.
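As a side note, here is a minimal sketch of how this storage step could look with the current pymongo API: insert() was deprecated in pymongo 3 and removed in pymongo 4, and insert_one() is its replacement. The helper name and field layout below simply mirror the original code and are otherwise arbitrary.

from pymongo import MongoClient

# Connect to a local MongoDB instance; databases and collections are created lazily on first write.
conn = MongoClient('localhost', 27017)
db = conn['comics']

def save_chapter(comic_name, chapter_no, img_urls):
    # Store one chapter's list of image URLs as a single document.
    record = {'_id': chapter_no, 'chapter': chapter_no, 'img': img_urls}
    result = db[comic_name].insert_one(record)  # raises DuplicateKeyError if the _id already exists
    return result.acknowledged

# Example: save_chapter('四月是你的谎言', 1, ['https://images.dmzj.com/a/1.jpg'])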

Running it directly

Error: No connection could be made because the target machine actively refused it

Couldn't solve this for now and searching didn't turn up much. Since the error comes from the MongoClient('localhost', 27017) connection, the most likely cause is that no MongoDB service is running locally, rather than the site detecting the crawler; either way, digging into it further isn't worth it if the goal is to learn more here in a short time.
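Before blaming the site, a quick way to check whether a local MongoDB service is actually reachable is a ping with a short server-selection timeout. This is only a hedged helper sketch; the function name and the 2-second timeout are arbitrary choices.

from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError

def mongo_available(host='localhost', port=27017, timeout_ms=2000):
    # Return True if a MongoDB server answers a ping within the timeout.
    client = MongoClient(host, port, serverSelectionTimeoutMS=timeout_ms)
    try:
        client.admin.command('ping')
        return True
    except ServerSelectionTimeoutError:
        return False

if not mongo_available():
    print('MongoDB is not reachable on localhost:27017, start the mongod service first.')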
Next, I try breaking the code apart and running it step by step.

Part 1

def get_comic(comic_name):
    param = {'s': comic_name}
    url = 'https://sacg.dmzj.com/comicsum/search.php?' + urlencode(param)  # this URL points to the JSON that holds the comic link; it was located in advance, so it is reused directly instead of digging through the Network tab again
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4315.5 Safari/537.36',
        'Referer': 'https://manhua.dmzj.com/tags/search.shtml?s=%E5%9B%9B%E6%9C%88%E6%98%AF%E4%BD%A0%E7%9A%84%E8%B0%8E%E8%A8%80',
        'Host': 'sacg.dmzj.com',
    }  # swapping in different headers did not help either
    # Extract the JS code that holds the image URL info
    response = requests.get(url, headers=headers)
    json_str = re.search(r"\[(.*?)\];", response.text).group(1)  # slicing out pieces with regexes
    json_str = re.search(r"({.*?})", json_str).group(1)
    comic_url = 'https:' + json.loads(json_str)['comic_url_raw']  # this part is wrong: that is not the right key name
    return comic_url
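Since the key name used above turned out to be wrong, one way forward is to print the keys the search API actually returns and pick the right one from there. The sketch below is only a debugging aid; which field really holds the comic page link depends on the live response and is not guaranteed here.

import json
import re
import requests
from urllib.parse import urlencode

def inspect_search_result(comic_name):
    # Print the JSON keys of the first search hit so the right URL field can be identified.
    url = 'https://sacg.dmzj.com/comicsum/search.php?' + urlencode({'s': comic_name})
    text = requests.get(url).text
    raw = re.search(r"\[(.*?)\];", text).group(1)
    first_hit = json.loads(re.search(r"({.*?})", raw).group(1))
    print(sorted(first_hit.keys()))  # look for whichever field holds the comic page link
    return first_hit

# inspect_search_result('四月是你的谎言')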

Step 2

def get_chapter(comic_url):  # takes the URL from the previous step, i.e. the link that is usable after decoding
    response = requests.get(comic_url)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    srcs = soup.find('div',class_="cartoon_online_border").find_all('a')
    chapters_url = []
    for a in srcs:
        chapters_url.append(a.get('href'))
    return chapters_url

This function did not work: it cannot pull out the chapter list, because the page at comic_url does not contain the chapter names.
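To see why nothing comes back, a small check like the one below prints whether the expected container exists on the page and, if it does not, dumps the first few links found. The class name cartoon_online_border comes from the code above; everything else here, including the sample URL in the comment, is only a guess for illustration.

import requests
from bs4 import BeautifulSoup

def debug_chapter_page(comic_url):
    # Check whether the expected chapter container is present on the comic page.
    soup = BeautifulSoup(requests.get(comic_url).text, 'lxml')
    border = soup.find('div', class_='cartoon_online_border')
    if border is None:
        print('No div.cartoon_online_border found; the layout or the URL is not what the code expects.')
        print('First links on the page:', [a.get('href') for a in soup.find_all('a')][:10])
    else:
        print('Found', len(border.find_all('a')), 'chapter links.')

# debug_chapter_page('https://manhua.dmzj.com/some_comic/')  # hypothetical comic page URL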

Step 3

def get_chapter_imgs_url(chapter_url):
    response = requests.get(chapter_url)
    js_str = re.search(r'eval\((.*?)\)\n', response.text).group(1)
    js_str = js_str.replace('function(p,a,c,k,e,d)', 'function fun(p, a, c, k, e, d)')
    fun = """
             function run(){
                    var result = %s;
                    return result;
                }
        """ % js_str  # the extracted packed JS is substituted into this wrapper
    pages = execjs.compile(fun).call('run')
    data = pages.split('=')[2][1:-2]
    url_list = json.JSONDecoder().decode(data)
    for i in range(0, len(url_list)):
        url_list[i] = 'https://images.dmzj.com/' + url_list[i]
    # url_list = dict(zip(range(1,len(url_list)+1),url_list))
    return url_list
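For reference, the split on '=' assumes that running the unpacked script produces a string of assignments whose third '='-separated piece is a quoted JSON array of image paths. The snippet below only illustrates that parsing step with a made-up string that mimics the shape the code expects; the real unpacked output can differ.

import json

# Hypothetical stand-in for the unpacked JS result; only the shape matters here.
pages = 'var g_comic_name="demo";var pages=\'["img/0001.jpg","img/0002.jpg"]\';'

data = pages.split('=')[2][1:-2]   # strips the surrounding quote and the trailing quote plus semicolon
url_list = json.JSONDecoder().decode(data)
url_list = ['https://images.dmzj.com/' + p for p in url_list]
print(url_list)  # ['https://images.dmzj.com/img/0001.jpg', 'https://images.dmzj.com/img/0002.jpg']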

Remaining parts
Stopping here for now: both the time spent and what I got out of it fell short of expectations, so I will switch to another example to learn from, ideally one that uses selenium.

  • [ 1] At the very least I am now comfortable with the technique of looping over a comic's image URLs to download images in batch.
  • [ 2] I would like to learn how to store images in a database. Right now I only know MySQL basics: I can create a database and tables, and I am not even fluent at inserting rows, so there is still a gap to the goal of storing the comics (a rough GridFS sketch follows this list).
  • [ 3] Only knowing the simple batch-download of images is not enough. The scraper should also handle details such as chapter names, file names and download progress messages (in other words, write more functions and get better at extracting content). The end goal is that the user types a comic name and the comic is downloaded to a specified folder!
  • [ 4] For parsing pages I am only fluent with bs4 so far; it needs to be combined with regular expressions.
  • [ 5] I should put more emphasis on getting fluent with selenium.
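On point [2], one common way to keep the image binaries themselves in MongoDB is GridFS. The sketch below is a rough, hedged illustration: the metadata fields and the path-style filename are made up to fit this project and are not taken from the original code.

import gridfs
import requests
from pymongo import MongoClient

db = MongoClient('localhost', 27017)['comics']
fs = gridfs.GridFS(db)  # GridFS stores binary files in chunks inside MongoDB

def save_image_to_gridfs(img_url, comic_name, chapter, img_num, headers=None):
    # Download one image and store its bytes in GridFS together with searchable metadata.
    resp = requests.get(img_url, headers=headers)
    if resp.status_code == 200:
        return fs.put(resp.content,
                      filename=comic_name + '/第' + str(chapter) + '话/' + str(img_num) + '.jpg',
                      comic=comic_name, chapter=chapter)
    return None

# Reading back later: fs.get(file_id).read() returns the raw image bytes.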
