python采集整站图片

记录学习Python历程

简单了解语法规则,尝试利用正则采集网站图片,结果还算理想,不过只写了手动查询的,自动采集加俩循环搞定,只是语法储备不足,还是有很多冗余代码,继续学习!!!继续撸起袖子加油干,奥利给!!!

import urllib.request
import re
import urllib
#函数返回值:cod 0为成功 1为失败 2为没有下一页

#函数返回值格式
def return_msg(cod, msg, data):
    """Bundle a status code, a message and a payload into one result dict.

    :param cod: status code (by this file's convention 0 is success and
        1 is failure).
    :param msg: human-readable message describing the outcome.
    :param data: payload to hand back to the caller.
    :return: a new dict with the keys ``'cod'``, ``'msg'`` and ``'data'``.
    """
    return {'cod': cod, 'msg': msg, 'data': data}

#获取网页源码
def get_html(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    The response object is used as a context manager so the underlying
    connection is closed as soon as the body has been read, even when
    reading or decoding fails.

    :param url: address to open; anything ``urllib.request.urlopen`` accepts.
    :return: the response body as ``str``.
    :raises urllib.error.URLError: when the URL cannot be opened.
    :raises UnicodeDecodeError: when the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')

#正则提取数组操作
def re_ac(html, reg):
    """Find every match of ``reg`` in ``html`` and wrap the outcome.

    :param html: text to search.
    :param reg: regular expression to apply (default flags).
    :return: ``return_msg(0, "匹配成功", matches)`` when at least one match
        is found, otherwise ``return_msg(1, "匹配失败", '')``.
    """
    matches = re.findall(reg, html)
    if matches:
        return return_msg(0, "匹配成功", matches)
    return return_msg(1,"匹配失败",'')

#正则提取数组操作 允许换行
def re_ac_tall(html, reg):
    """Like ``re_ac``, but ``.`` in the pattern also matches newlines.

    The pattern is applied with ``re.DOTALL`` so a single match may span
    several lines of ``html``.

    :param html: text to search.
    :param reg: regular expression to apply.
    :return: ``return_msg(0, "匹配成功", found)`` when something matched,
        otherwise ``return_msg(1, "匹配失败", '')``.
    """
    found = re.findall(reg, html, flags=re.DOTALL)
    return return_msg(0, "匹配成功", found) if found else return_msg(1,"匹配失败",'')

#获取子页面地址
def get_son_url(html):
    """Match the sub-page links contained in a listing page.

    NOTE(review): from this function to the end of the script the listing
    was damaged in extraction: all indentation was collapsed onto one line
    and the HTML markup inside the regex literals was stripped.  The layout
    below is a reconstruction; every place where information was lost is
    marked ``NOTE(review)`` and has to be checked against the original
    script.

    :param html: page source to search.
    :return: the result dict produced by ``re_ac``.
    """
    # NOTE(review): link pattern lost -- only a bare ``(.*?)`` group survived.
    # As written it matches an empty string at every position of the page, so
    # restore the real pattern before running the crawl loop at the bottom.
    urllist = re_ac(html, r'(.*?)')
    return urllist


# NOTE(review): the ``def`` line and the first statement of this function were
# lost in extraction; its remaining body was fused onto get_son_url.  The name
# ``get_next_url``, the ``html`` parameter and the first ``re_ac`` call are
# reviewer placeholders -- confirm them against the original script.  Nothing
# in the visible script calls this function.
def get_next_url(html):
    """Work out the link to the next page of a listing.

    :param html: page source to search.
    :return: the ``re_ac`` result for the next-page link on success;
        ``return_msg(2, "没有下一页了", '')`` when no next page is found;
        ``return_msg(1, "数据匹配失败", '')`` when the first match fails.
    """
    # NOTE(review): pattern lost in extraction (placeholder).
    urls = re_ac(html, r'(.*?)')
    res = urls
    if urls['cod'] == 0:
        # ``data`` is the list returned by findall; the damaged source used
        # the list itself in the substring test and as the text of the second
        # match, which raises TypeError.  Take the first match, as get_class
        # does with its own result.
        nexturl = urls['data'][0]
        if nexturl != '':
            if '下一页' in nexturl:
                # NOTE(review): pattern emptied by extraction.
                url = re_ac(nexturl, r'')
                res = url
                if url['cod'] == 1:
                    res = return_msg(2, "没有下一页了", '')
            else:
                res = return_msg(2, "没有下一页了", '')
    else:
        res = return_msg(1, "数据匹配失败", '')
    return res


# Get the categories.
def get_class(html):
    """Extract the ``(href, label)`` pairs of the category menu.

    :param html: source of the landing page.
    :return: the ``re_ac`` result holding the ``(href, label)`` tuples on
        success, otherwise ``return_msg(1, "数据匹配失败", '')``.
    """
    # NOTE(review): the pattern that isolates the category block was emptied
    # by extraction.  With an empty pattern the first match is an empty
    # string, so the second match below fails and the function reports
    # "数据匹配失败" until the pattern is restored.
    info = re_ac_tall(html, r'')
    if info['cod'] != 0:
        return return_msg(1, "数据匹配失败", '')
    infos = re_ac(info['data'][0], r'href="(.*?)" >(.*?)<')
    if infos['cod'] != 0:
        return return_msg(1, "数据匹配失败", '')
    return infos


def _to_int(text):
    """Return ``int(text)``, or ``None`` when ``text`` is not an integer."""
    try:
        return int(text)
    except ValueError:
        return None


host = "https://www."+"wo"+"yao"+"ge"+"xing"+".com"

# Read the pictures: list the categories, then repeatedly ask for a category
# number and a page number and print the image data found on that page.
html_b = get_html(host+"/touxiang/")
if html_b != '':
    n_class = get_class(html_b)
    n_class_data = n_class['data']
    if n_class['cod'] == 0:
        while True:
            print("PS:请根据序号查询分类图片")
            for i, v in enumerate(n_class_data):
                print(str(i) + '---' + str(v[1]))
            index = input("请输入查询序号:")
            if index == '-1':
                print("已退出查询")
                break
            choice = _to_int(index)
            # Valid indices are 0 .. len - 1.  The damaged source compared
            # with ``>``, letting ``len`` through to an IndexError, and
            # crashed with ValueError on non-numeric input.
            if choice is None or choice < 0 or choice >= len(n_class_data):
                print("请输入正确的序号")
                continue
            page_no = _to_int(input("请输入查询页码:"))
            while page_no is None or page_no <= 0:
                print("请输入大于等于1的页码")
                page_no = _to_int(input("请输入查询页码:"))
            page = str(page_no)
            class_path = n_class_data[choice][0]
            # NOTE(review): this printed address is built without the "/"
            # that the fetched address below uses, and is printed even for
            # page 1 -- kept as in the source; confirm which form is right.
            print(host + class_path + "index_" + page + ".html")
            class_url = host + class_path
            if page_no > 1:
                class_url = host + class_path + "/index_" + page + ".html"
            html_c = get_html(class_url)
            urllist = get_son_url(html_c)
            if urllist['cod'] != 0:
                # NOTE(review): in the collapsed source this print follows
                # the ``break`` and its nesting is lost; it is placed first
                # here so the failure result is actually shown.
                print(urllist)
                break
            for url in urllist['data']:
                html = get_html(host + url)
                # NOTE(review): get_son_img is not defined anywhere in the
                # visible listing; it was presumably lost with the text
                # between get_son_url and the next-page logic.
                imgres = get_son_img(html)
                if imgres['cod'] == 0:
                    print(imgres['data'])
                else:
                    # NOTE(review): same lost nesting as above -- the print
                    # came after the ``break`` in the collapsed source.
                    print(imgres)
                    break
    else:
        print("分类获取失败")
else:
    print("数据查询失败")

你可能感兴趣的:(python采集整站图片)