I wrote this a while ago; it draws on examples found online, adapted to my specific situation.
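The script mirrors a site to local disk: the Retrieve class maps each URL to a file path and downloads it, while the Crawl class keeps a frontier of pending links, walks every page under the given domain, and logs each decision to urlall.txt.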
# -*- coding: utf-8 -*-
import socket
import lxml.html
# import chardet
import time
from urllib import urlretrieve
from urlparse import urlparse
from os import makedirs, remove, removedirs
from os.path import exists, splitext, dirname
import sys
# from httptools import strToUnicode, unicodeToStr
class Retrieve(object):
    # def __init__(self, url, baseUrl):
    def __init__(self, url):
        self.url = url
        self.file = self.fileName()
        self.baseUrl = url
        # self.filetype = splitext(self.file)[1][1:]
        # print 'filetype: %s' % self.filetype
        # self.charset = ''
    def fileName(self):
        """Map the URL to a local file path, creating the directory tree as needed."""
        urlPart = urlparse(self.url)
        # urlPart[1] is the host (netloc), urlPart[2] is the path
        path = urlPart[1] + urlPart[2]
        if not urlPart[2]:
            path = urlPart[1] + '/'
        ext = splitext(path)
        if ext[1] == '':
            path += 'index.html'
        filePath = path
        path = dirname(path)
        if not exists(path):
            makedirs(path)
        return filePath
    def downLoad(self):
        """Download the file."""
        if exists(self.file):
            return ('** file exists',)
        socket.setdefaulttimeout(25)
        try:
            result = urlretrieve(self.url, self.file)
        except Exception, e:
            print 'download error:', e
            result = ('** invalid url', )
            return result
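        # Heuristic: a response shorter than 250 bytes is treated as an
        # error page; discard the file and prune the now-empty directory.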
        if ('content-length' in result[1]) and int(result[1]['content-length']) < 250:
            path = dirname(self.file)
            remove(self.file)
            try:
                removedirs(path)
            except OSError:
                pass
            result = ('** invalid url', )
        return result
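    # Parses the saved file with lxml and returns every link target as an
    # absolute URL, resolved against baseUrl.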
    def getLinks(self):
        """Extract the links from the downloaded file."""
        f = open(self.file)
        html = f.read()
        f.close()
        # Encoding detection and conversion
        # charJust = chardet.detect(html)
        # try:
        #     if charJust['encoding'].lower() == 'gb2312':
        #         charJust['encoding'] = 'gb18030'
        # except Exception, e:
        #     charJust['encoding'] = 'utf-8'
        # self.charset = charJust['encoding']
        # html = strToUnicode(html, encoding=self.charset)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(base_url=self.baseUrl, resolve_base_href=False)
        linkList = []
        for link in lxml.html.iterlinks(doc):
            linkList.append(link[2])
        # Rewrite absolute paths back to relative ones
        # self.linkReplFunc(doc)
        return linkList
class Crawl(object):
    def __init__(self, url, domain):
        self.url = url
        self.domain = domain
        self.seen = []       # URLs already downloaded
        self.vlink = [url]   # frontier: URLs waiting to be visited
        self.baseUrl = url
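    # Downloads self.url and pushes newly discovered in-domain links
    # onto the frontier.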
    def getPage(self):
        # rv = Retrieve(self.url, self.baseUrl)
        rv = Retrieve(self.url)
        # Record this URL as already downloaded
        self.seen.append(self.url)
        result = rv.downLoad()
        if result[0] == '** invalid url':
            self.log('** download err')
            return
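        # Static and binary resources carry no HTML links, so skip parsing
        # them; the three-character slice also matches '.js'.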
        if self.url[-3:].lower() in ['css', '.js', 'jpg', 'png', 'gif', 'bmp', 'mp4', 'exe', 'bin', 'swf', 'ico']:
            return
        # Skip pages that point off to sina
        if self.url.find('sina') != -1:
            self.log('** sina url')
            return
        try:
            links = rv.getLinks()
        except Exception, e:
            print 'getLinks error:', e
            return
        self.log('success download')
        for link in links:
            self.log('get link', link)
            # Strip the query string and fragment
            link = link.split('?')[0].split('#')[0]
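            # Accept a link only if it is unseen, not already queued, inside
            # the target domain, contains a single scheme, and stays on the
            # uctest.ucweb.com:81 host.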
            if (link not in self.seen) and (link not in self.vlink) \
                    and (self.domain in link) and link.count('://') < 2 \
                    and link.find('http://uctest.ucweb.com:81') == 0:
                self.log('++app', link)
                print 'appendlink: %s' % link
                self.vlink.append(link)
            else:
                self.log('--drop', link)
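    # Drains the frontier; pop() takes the most recently added link,
    # so the crawl proceeds depth-first.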
    def go(self):
        while self.vlink:
            # time.sleep(2)
            link = self.vlink.pop()
            self.url = link
            # print 'download list: ', self.vlink
            print 'download: ', self.url
            self.getPage()
            # sys.exit(0)
    def log(self, st, link=''):
        f = open('urlall.txt', 'a')
        if len(link) == 0:
            f.write((u'%s:\t%s\n' % (st, self.url)).encode("utf-8"))
        else:
            f.write((u'%s:\t%s\n' % (st, link)).encode("utf-8"))
        f.close()
if __name__ == '__main__':
    # url = "http://www.phpv.net/topics/79.html"
    # url = 'http://uctest.ucweb.com:81/wml/Graphics/htmlcachepic/1p_11.html'
    url = "http://uctest.ucweb.com:81/wml/index.wml"
    domain = 'uctest.ucweb.com'
    cr = Crawl(url, domain)
    cr.go()
    print 'download over'
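Note that besides the domain substring check, getPage also pins accepted links to the http://uctest.ucweb.com:81 prefix, so pointing the crawler at a different site means changing both the __main__ block and that hardcoded prefix. The code targets Python 2 (urllib.urlretrieve, urlparse, the old except syntax) and needs lxml installed.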