A Failed Crawler

import scrapy


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    # allowed_domains takes bare domains; a trailing slash breaks the offsite filter
    allowed_domains = ['umei.fun']
    start_urls = [
        'http://umei.fun/posts/4212',
        'http://umei.fun/posts/4211',
    ]

    def parse(self, response):
        # Name the file after the post id (the last URL segment);
        # the original used [-2], which is 'posts' for both URLs,
        # so the two responses overwrote the same file
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)

scrapy crawl dmoz

It failed.
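
The post doesn't say how the crawl failed, but a frequent cause is the site rejecting Scrapy's default User-Agent. Below is a minimal sketch of the same spider with a browser User-Agent set through custom_settings; USER_AGENT is a standard Scrapy setting, but that umei.fun blocks the default one is only an assumption.

import scrapy


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ['umei.fun']
    start_urls = [
        'http://umei.fun/posts/4212',
        'http://umei.fun/posts/4211',
    ]
    # Assumption: the failure is a blocked default User-Agent.
    # custom_settings applies the setting to this spider only.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/78.0.3904.97 Safari/537.36',
    }

    def parse(self, response):
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)

Running scrapy shell 'http://umei.fun/posts/4212' and inspecting response.status would show whether the request is being rejected (e.g. a 403) before blaming the parsing code.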
————————————————————————————————————————————
import urllib.request
import os


def url_open(url):
    req = urllib.request.Request(url)
    # 'Uset-Agent' was a typo; the header is 'User-Agent'
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/78.0.3904.97 Safari/537.36')
    # Open the Request object, not the bare URL, or the header is never sent
    response = urllib.request.urlopen(req)
    html = response.read()
    print(url)
    return html


def get_page(url):
    # Scrape the current page number out of the front page; this assumes
    # the number appears shortly after a 'main-image' marker, ending at ']'
    html = url_open(url).decode('utf-8')
    a = html.find('main-image') + 13
    b = html.find(']', a)
    return html[a:b]


def find_imgs(url):
    html = url_open(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')

    while a != -1:
        b = html.find('.jpg', a, a + 255)  # look for a .jpg within 255 chars
        if b != -1:
            # a+9 skips past 'img src="'; b+4 keeps the '.jpg' extension
            img_addrs.append(html[a+9:b+4])
        else:
            b = a + 9
        a = html.find('img src=', b)

    for each in img_addrs:
        print(each)
    return img_addrs  # the original never returned the list


def save_imgs(folder, img_addrs):
    # Left as `pass` in the original; the usual pattern is to name each
    # file after the last URL segment and write the downloaded bytes
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open(each))


def download_mm(folder='xxoo', pages=10):  # create the folder, fetch the first 10 pages
    os.mkdir(folder)
    os.chdir(folder)
    url = 'https://www.mzitu.com/'  # trailing slash so the page URL concatenates cleanly
    page_num = int(get_page(url))

    for i in range(pages):
        # The original did `page_num -= i`, which fetches the first page twice
        # and then skips ahead; stepping back one page per iteration is intended
        page_url = url + 'page-' + str(page_num - i) + '#'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':  # the underscores were eaten by the blog's formatting
    download_mm()
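
Even with the fixes above, the image downloads may still fail: hosts like mzitu commonly check the Referer header as hotlink protection. That this is the actual cause here is a guess, not something the post confirms. A sketch of url_open with a Referer added:

import urllib.request


def url_open(url, referer='https://www.mzitu.com/'):
    # Assumption: the host rejects requests without a same-site Referer
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/78.0.3904.97 Safari/537.36')
    req.add_header('Referer', referer)
    with urllib.request.urlopen(req) as response:
        return response.read()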
