To gather the large number of images needed for transfer learning on an image-classification task, I spent a little time learning basic Python web-scraping techniques, which makes collecting images much more efficient.
1. Fetch the entire page
#coding=utf-8
import urllib

def getHtml(url):
    # Open the URL and read back the raw HTML of the whole page
    page = urllib.urlopen(url)
    html = page.read()
    return html

html = getHtml("http://tieba.baidu.com/p/2738151262")
print html
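Note that all of the snippets in this post are Python 2 (urllib.urlopen and the print statement no longer exist in Python 3). Under Python 3 the same fetch goes through urllib.request; a minimal equivalent sketch of my own, not part of the original post:

# Python 3 sketch (my adaptation, not from the original post)
import urllib.request

def get_html(url):
    # urlopen returns bytes in Python 3, so decode explicitly
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8', errors='ignore')

print(get_html("http://tieba.baidu.com/p/2738151262"))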
2. Filter out the data you want from the page
import re
import urllib

def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    # Capture every .jpg URL that appears in a src="..." attribute
    # followed by a pic_ext attribute
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    return imglist

html = getHtml("http://tieba.baidu.com/p/2460150866")
print getImg(html)
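To see what the regex captures, you can run it against a small HTML fragment. The snippet below is fabricated for illustration, not taken from the real page:

# Illustrative only: a made-up fragment in the markup shape the regex expects
sample = 'foo <img src="http://imgsrc.baidu.com/abc.jpg" pic_ext="jpeg"> bar'
print re.findall(r'src="(.+?\.jpg)" pic_ext', sample)
# -> ['http://imgsrc.baidu.com/abc.jpg']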
3. Save the filtered data locally
#coding=utf-8
import urllib
import re

def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        # Download each image and save it locally as 0.jpg, 1.jpg, ...
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    return imglist  # return the list so the print below shows URLs, not None

html = getHtml("http://tieba.baidu.com/p/2460150866")
print getImg(html)
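One weakness of the download loop above: urllib.urlretrieve raises IOError on any failed download, which aborts the whole run. A small defensive variant (my own addition, not in the original) skips failures instead:

# Defensive download loop (a sketch of my own, not from the original post)
for x, imgurl in enumerate(imglist):
    try:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
    except IOError as e:
        # skip this image and keep going
        print 'failed to fetch %s: %s' % (imgurl, e)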
A detailed walkthrough is available at this link.
4. A small example I wrote myself after this quick introduction to Python scraping
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
import urllib
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

def getAllImageLink():
    html = urllib2.urlopen('http://www.xiaohuar.com').read()
    '''
    The fromEncoding parameter:
    set it to gb18030
    to fix garbled Chinese characters in the parsed output
    '''
    soup = BeautifulSoup(html, fromEncoding="gb18030")
    liResult = soup.findAll('li')
    x = 0
    for li in liResult:
        imageEntityArray = li.findAll('img')
        if len(imageEntityArray):
            for image in imageEntityArray:
                link = image.get('lazysrc')
                imageName = image.get('alt')
                # guard before parsing: image.get() returns None when the
                # attribute is missing, and urlparse(None) would crash
                if not link or not imageName:
                    continue
                obj = urlparse(link)
                # filter for images hosted on the site itself, then save them
                if obj.scheme == 'http' and obj.hostname == 'www.xiaohuar.com':
                    print link
                    print imageName
                    x += 1
                    # the alt text is used as the file name, so it must not
                    # contain characters that are illegal in file names
                    urllib.urlretrieve(link, '%s.jpg' % imageName)

if __name__ == '__main__':
    getAllImageLink()
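BeautifulSoup 3 (imported above as from BeautifulSoup import BeautifulSoup) has been unmaintained for years. Under Python 3 with beautifulsoup4, roughly the same scrape would look like the sketch below; this is my adaptation, and it assumes the site's markup (the lazysrc and alt attributes) is unchanged:

# Python 3 / beautifulsoup4 sketch (my adaptation, not from the original post)
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup

html = urllib.request.urlopen('http://www.xiaohuar.com').read()
# bs4 renames fromEncoding to from_encoding; gb18030 still decodes the Chinese text
soup = BeautifulSoup(html, 'html.parser', from_encoding='gb18030')
for img in soup.find_all('img'):
    link, name = img.get('lazysrc'), img.get('alt')
    if link and name and urlparse(link).hostname == 'www.xiaohuar.com':
        urllib.request.urlretrieve(link, '%s.jpg' % name)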
Reference links
Fixing garbled Chinese output from BeautifulSoup