I'm not much good at reinventing the wheel, so I borrowed this netizen's code and improved it:
http://blog.csdn.net/xiligey1/article/details/73321152
The libraries used are requests, nude, urllib and threading.
Pay special attention to nude: installing it under PyPy failed for me for no obvious reason, so I can only run this code with Python 2.
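A quick sanity check that the environment has everything (my own snippet, not from the original post; under Python 2 all of these should import cleanly):

# coding=utf-8
import sys
print(sys.version)  # expect a 2.x version string, per the note above
import urllib       # stdlib
import threading    # stdlib
import requests     # pip-installable
import nude         # the tricky one; installing it under PyPy failed for me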
First, the original code from that post:
# coding=utf-8
"""Download Baidu image-search results for a given keyword."""
import re
import sys
import urllib

import requests


def get_onepage_urls(onepageurl):
    """Return all image URLs on one result page, plus the URL of the next page."""
    if not onepageurl:
        print('Reached the last page, stopping')
        return [], ''
    try:
        html = requests.get(onepageurl).text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # Match the "next page" (下一页) link. The anchor tag was swallowed when the
    # post was rendered; restored here from the original CSDN code.
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url
def down_pic(pic_urls):
    """Given a list of image URLs, download them all."""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(string, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
if __name__ == '__main__':
    keyword = '苍老师'  # the search keyword; change it to whatever you would type into Baidu Images
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + urllib.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    fanye_count = 0  # pages crawled so far
    while 1:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        print('Page %s' % fanye_count)
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)
    down_pic(list(set(all_pic_urls)))
But this is far too slow: several thousand images, downloaded one at a time.
So I switched to multithreading. Worried about crashes and freezes, I only used four threads;
you can adjust the count to suit your machine (there is also a more generic splitting sketch after the listing).
After the change:
# coding=utf-8
"""Multithreaded version: same crawler, but the downloads are split across four threads."""
import re
import urllib
import threading

import requests

wrong = 0                      # shared count of failed downloads
wrong_lock = threading.Lock()  # guards wrong, since four threads update it


def get_onepage_urls(onepageurl):
    """Return all image URLs on one result page, plus the URL of the next page."""
    if not onepageurl:
        print('Reached the last page, stopping')
        return [], ''
    try:
        html = requests.get(onepageurl).text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # same "next page" (下一页) regex as above, restored from the original CSDN code
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url
def down_pic(pic_urls):
    """Given a list of [index, url] pairs, download them all."""
    # The pairs carry a stable 1-based index, so each thread writes to
    # non-overlapping filenames no matter which slice it is handed.
    global wrong
    bigest = 0  # largest index seen; kept from the original but unused afterwards
    for i, pic_url in pic_urls:
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i) + '.jpg'  # i is already 1-based, so no +1 (the original's +1 shifted every filename)
            with open(string, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (str(i), str(pic_url)))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(i), str(pic_url)))
            print(e)
            with wrong_lock:
                wrong += 1
            continue
        if i > bigest:  # the original wrote "if I>bigest", a NameError
            bigest = i
if __name__ == '__main__':
    keyword = '美女'  # the search keyword ("beautiful women"); change it to whatever you like
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + urllib.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    fanye_count = 0  # pages crawled so far
    while 1:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        print('Page %s' % fanye_count)
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)
    # pair every deduplicated URL with a stable 1-based index: [[1, url], [2, url], ...]
    The_list = list(set(all_pic_urls))
    x = 1
    a = []
    for i in The_list:
        word = [x, i]
        x = x + 1
        a.append(word)
    The_list = a
    all_num = The_list[-1][0]
    num = int(The_list[-1][0]) // 4
    print('Total: ' + str(all_num) + ' images')
    # split the indexed list into four roughly equal slices, one per thread
    list1 = The_list[0:num]
    list2 = The_list[num:2 * num]
    list3 = The_list[2 * num:3 * num]
    list4 = The_list[3 * num:]  # was The_list[3*num:-1], which silently dropped the last image
    threads = []
    t1 = threading.Thread(target=down_pic, args=(list1,))
    threads.append(t1)
    t2 = threading.Thread(target=down_pic, args=(list2,))
    threads.append(t2)
    t3 = threading.Thread(target=down_pic, args=(list3,))
    threads.append(t3)
    t4 = threading.Thread(target=down_pic, args=(list4,))
    threads.append(t4)
    for t in threads:
        t.setDaemon(True)
        t.start()
    for t in threads:
        t.join()
    print('over')
    # wrong is the module-level failure counter bumped inside down_pic;
    # the original wrote "global wrong" here, which does nothing at module scope
    print('Failed downloads: ' + str(wrong))
    # down_pic(list(set(all_pic_urls)))
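The four hard-coded slices above work, but a generic split is barely more code. A minimal sketch of that (my own addition, not in the original post; split_work is a name I made up) which chops the indexed list into N chunks and spawns one thread per chunk, as a drop-in replacement for the block above:

def split_work(items, n):
    """Split items into n roughly equal chunks; the last chunk takes the remainder."""
    size = max(1, len(items) // n)
    chunks = [items[k * size:(k + 1) * size] for k in range(n - 1)]
    chunks.append(items[(n - 1) * size:])  # nothing gets dropped, unlike a [:-1] slice
    return chunks

threads = [threading.Thread(target=down_pic, args=(chunk,))
           for chunk in split_work(The_list, 4)]
for t in threads:
    t.setDaemon(True)
    t.start()
for t in threads:
    t.join()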
Some of my changes aren't pretty, I admit.
Anyway, after running this the local folder holds 1.jpg through 2000.jpg (assuming 2,000 images).
Next comes the filtering.
import nude, os, threading

A quick introduction to nude (https://github.com/hhatto/nude.py): it is a nudity-detection library that works by locating skin-colored regions in the image and then analyzing them (written by a Japanese developer).
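Its basic usage looks like this (adapted from the project's README; the sample filename 1.jpg is mine):

import nude
from nude import Nude

print(nude.is_nude('1.jpg'))  # True or False

# the Nude class exposes a bit more detail
n = Nude('1.jpg')
n.parse()
print('%s %s' % (n.result, n.inspect()))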
And here is the trick: we can use it in reverse.
If an image is flagged as explicit, we keep it; otherwise we delete it with os.remove.
def nude_yesorno(num):
    """Return True if <num>.jpg is flagged as nude, False otherwise (or on any error)."""
    try:
        res = nude.is_nude(str(num) + '.jpg')
        return res
    except Exception:
        return False


def panduan(first, bigest):
    """Check images first..bigest-1 and delete every one that is not flagged."""
    for i in range(first, bigest):
        res = nude_yesorno(i)
        if res == False:
            try:
                os.remove(str(i) + '.jpg')
            except OSError:
                pass
        print('%s %s' % (i, res))
But with two thousand images, checking them one by one is slow as well,
so multithreading comes in again:
def thread_panduan(biggest):
    """Split the index range 1..biggest across four threads and filter in parallel."""
    a_4 = biggest // 4  # the original divided by 5 yet spawned only four threads,
                        # leaving the last fifth of the images unchecked
    threads = []
    t1 = threading.Thread(target=panduan, args=(1, a_4))
    threads.append(t1)
    t2 = threading.Thread(target=panduan, args=(a_4, 2 * a_4))
    threads.append(t2)
    t3 = threading.Thread(target=panduan, args=(2 * a_4, 3 * a_4))
    threads.append(t3)
    t4 = threading.Thread(target=panduan, args=(3 * a_4, biggest + 1))
    threads.append(t4)
    for t in threads:
        t.start()  # the original joined the threads without ever starting them
    for t in threads:
        t.join()
    print('over')
OK.
For stability I kept the two scripts separate; downloading and analyzing at the same time would probably leave little room for error handling.
The same idea should also work for screening videos: grab frames, run the check on each, and decide. I'll write that up when I have time.
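Just to record the idea, an untested sketch of the video version (using OpenCV to sample frames is my choice; the original post names no library, and check_video is a name I made up):

# coding=utf-8
import cv2   # opencv-python
import nude

def check_video(path, step=100):
    """Sample every step-th frame of the video and count how many are flagged."""
    cap = cv2.VideoCapture(path)
    idx, hits = 0, 0
    while True:
        ok, frame = cap.read()
        if not ok:  # end of stream
            break
        if idx % step == 0:
            cv2.imwrite('frame_tmp.jpg', frame)  # nude.py works on files, so write the frame out
            if nude.is_nude('frame_tmp.jpg'):
                hits += 1
        idx += 1
    cap.release()
    return hits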
GitHub: https://github.com/Muxxs/pic_nude
blog: http://muxxs.com
Feedback and discussion welcome!