网络爬虫:抓取XXOO图片

基本程序

# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup
import re

def url_open(url):
    """Fetch *url* with a desktop-browser User-Agent and return the raw response bytes."""
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        },
    )
    # Context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read()

def get_pagenum(url):
    """Return the newest page number shown on the jandan ooxx index (e.g. "2320") as a string.

    Raises ValueError when the "[NNNN]" marker is not present in the page,
    instead of the original's opaque AttributeError on ``None.group()``.
    """
    html = url_open(url).decode("utf-8")
    # One regex with a capture group replaces the original two-pass search,
    # and \d+ generalizes beyond exactly four digits (backward compatible).
    match = re.search(r'\[(\d+)\]', html)
    if match is None:
        raise ValueError("page-number marker [NNNN] not found at %s" % url)
    return match.group(1)


def get_images(url):
    """Return the image addresses (host + path, no scheme) found on *url*.

    With a single capture group in the pattern, ``findall`` returns just the
    group's text, not the whole match — which is exactly what ``save_imgs``
    expects when it prepends "http://".
    """
    html = url_open(url).decode("utf-8")
    # NOTE(review): the published pattern was garbled (unterminated string
    # literal -> SyntaxError). Reconstructed from the download convention in
    # save_imgs ("http://%s" % each) and the commented-out draft that matched
    # "//w...jpg": capture the sinaimg host and path only — confirm against
    # the live page markup.
    jpg_re = re.compile(r'<img src="//(\w+\.sinaimg\.cn/[^"]+?\.jpg)"')
    return jpg_re.findall(html)


def save_imgs(img_list):
    """Download every address in *img_list* (host/path without scheme) into the current directory."""
    # enumerate replaces the original hand-maintained counter.
    for i, each in enumerate(img_list, start=1):
        # The last path segment doubles as the local file name.
        filename = each.split("/")[-1]
        with open(filename, "wb") as f:
            img = url_open("http://%s" %each)
            f.write(img)
            print ("下载本页的第%s张图片,名称为%s" %(i,filename))

def download__mm(dir,url):
    """Download images from the 20 pages preceding the newest one into directory *dir*.

    ``dir`` shadows the builtin, but the parameter name is kept for
    backward compatibility with existing callers.
    """
    # makedirs(exist_ok=True) collapses the original duplicated if/else branches.
    os.makedirs(dir, exist_ok=True)
    os.chdir(dir)

    page_num = int(get_pagenum(url))
    for _ in range(20):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        img_list = get_images(pageurl)
        print("下载第%s页图片" % page_num)
        save_imgs(img_list)  # returns None; original bound it to an unused variable

if __name__=="__main__":

    dir="PaPa"
    url= "http://jandan.net/ooxx/"
    download__mm(dir,url)

增加代理

**但是存在的问题是:使用代理后出现 `urllib.error.URLError:
[WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。`**

# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup
import re
import random

# Harvested "ip:port" proxy strings; filled in place by get_proxy().
proxies = []
# Desktop-browser User-Agent shared by every request this script makes.
headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

def get_proxy():
    """Scrape xicidaili.com for proxies, append "ip:port" strings to the global list, and return it."""
    url = "http://www.xicidaili.com"
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6).
    # NOTE(review): on this site the IP and port sit in separate <td> cells,
    # so the bare '\s*' joiner may not match the live HTML — verify.
    ip_re = re.compile(r'(\d+)\.(\d+)\.(\d+)\.(\d+)\s*(\d+)')
    for each in ip_re.findall(html):
        # each = (octet, octet, octet, octet, port) -> "a.b.c.d:port"
        proxies.append(":".join([".".join(each[0:4]), each[4]]))
    return proxies


def change_proxy():
    """Install a urllib opener routing HTTP through a randomly chosen entry of the global proxy list.

    Raises IndexError if ``proxies`` is empty (get_proxy() was not run or found nothing).
    """
    proxy = random.choice(proxies)
    # 'is None' is the idiomatic identity test (original used '== None').
    # random.choice never yields None unless None was explicitly added to the
    # list; that sentinel falls back to a direct (no-proxy) connection.
    if proxy is None:
        proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({"http": proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [("User-Agent", headers["User-Agent"])]
    # install_opener makes the proxy global for all later urlopen() calls.
    urllib.request.install_opener(opener)
    print('智能切换代理:%s' % ('本机' if proxy is None else proxy))



def url_open(url):
    """Fetch *url* using the module-wide browser headers and return the raw bytes."""
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read()

def get_pagenum(url):
    """Return the newest page number shown on the jandan ooxx index (e.g. "2320") as a string.

    Raises ValueError when the "[NNNN]" marker is not present in the page,
    instead of the original's opaque AttributeError on ``None.group()``.
    """
    html = url_open(url).decode("utf-8")
    # One regex with a capture group replaces the original two-pass search,
    # and \d+ generalizes beyond exactly four digits (backward compatible).
    match = re.search(r'\[(\d+)\]', html)
    if match is None:
        raise ValueError("page-number marker [NNNN] not found at %s" % url)
    return match.group(1)


def get_images(url):
    """Return the image addresses (host + path, no scheme) found on *url*.

    With a single capture group in the pattern, ``findall`` returns just the
    group's text, not the whole match — which is exactly what ``save_imgs``
    expects when it prepends "http://".
    """
    html = url_open(url).decode("utf-8")
    # NOTE(review): the published pattern was garbled (unterminated string
    # literal -> SyntaxError). Reconstructed from the download convention in
    # save_imgs ("http://%s" % each): capture the sinaimg host and path only —
    # confirm against the live page markup.
    jpg_re = re.compile(r'<img src="//(\w+\.sinaimg\.cn/[^"]+?\.jpg)"')
    return jpg_re.findall(html)


def save_imgs(img_list):
    """Download every address in *img_list* (host/path without scheme) into the current directory."""
    # enumerate replaces the original hand-maintained counter.
    for i, each in enumerate(img_list, start=1):
        # The last path segment doubles as the local file name.
        filename = each.split("/")[-1]
        with open(filename, "wb") as f:
            img = url_open("http://%s" %each)
            f.write(img)
            print ("下载本页的第%s张图片,名称为%s" %(i,filename))

def download__mm(dir,url):
    """Download images from the 20 pages preceding the newest one into directory *dir*.

    ``dir`` shadows the builtin, but the parameter name is kept for
    backward compatibility with existing callers.
    """
    # makedirs(exist_ok=True) collapses the original duplicated if/else branches.
    os.makedirs(dir, exist_ok=True)
    os.chdir(dir)

    page_num = int(get_pagenum(url))
    for _ in range(20):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        img_list = get_images(pageurl)
        print("下载第%s页图片" % page_num)
        save_imgs(img_list)  # returns None; original bound it to an unused variable

if __name__=="__main__":
    get_proxy()
    change_proxy()
    dir="PaPa"
    url= "http://jandan.net/ooxx/"
    download__mm(dir,url)

你可能感兴趣的:(python)