Crawling a website with Python

I wrote this a while ago, drawing on examples found online and adapting them to a specific case. Starting from a seed URL, the script saves each page into a local directory tree that mirrors the URL path, extracts the links with lxml, and keeps crawling as long as the links stay inside the target domain.


# -*- coding: utf-8 -*-

import socket
import lxml.html

from urllib import urlretrieve
from urlparse import urlparse
from os import makedirs, remove, removedirs
from os.path import exists, splitext, dirname

class Retrieve(object):
    """Download a single URL to a local mirror path and extract its links."""

    def __init__(self, url):
        self.url = url
        self.file = self.fileName()
        self.baseUrl = url

    def fileName(self):
        """Map the URL to a local file path, creating directories as needed."""

        urlPart = urlparse(self.url)
        path = urlPart[1] + urlPart[2]  # netloc + path
        if not urlPart[2]:
            path = urlPart[1] + '/'
        if splitext(path)[1] == '':
            # No file extension: treat the URL as a directory and save index.html inside it
            if not path.endswith('/'):
                path += '/'
            path += 'index.html'

        filename = path
        path = dirname(path)

        if not exists(path):
            makedirs(path)

        return filename

    def downLoad(self):
        """Download self.url to self.file."""
        if exists(self.file):
            return ('** file exists',)
        socket.setdefaulttimeout(25)
        try:
            result = urlretrieve(self.url, self.file)
        except Exception, e:
            print 'download error:', e
            return ('** invalid url',)
        # Treat very small responses (under 250 bytes) as error pages and discard them
        if ('content-length' in result[1]) and int(result[1]['content-length']) < 250:
            path = dirname(self.file)
            remove(self.file)
            try:
                removedirs(path)
            except OSError:
                pass
            result = ('** invalid url',)
        return result


    def getLinks(self):
        """Parse the saved file and return all links in it as absolute URLs."""
        f = open(self.file)
        html = f.read()
        f.close()

        # (Charset detection and conversion via chardet was tried here but is disabled.)

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(base_url=self.baseUrl, resolve_base_href=False)

        # iterlinks() yields (element, attribute, link, pos) tuples
        linkList = [link for element, attribute, link, pos in doc.iterlinks()]

        return linkList

class Crawl(object):
    """Crawler that walks outward from a seed URL but stays inside one domain."""

    def __init__(self, url, domain):
        self.url = url
        self.domain = domain
        self.seen = set()   # URLs already fetched
        self.vlink = [url]  # URLs still to visit
        self.baseUrl = url

    def getPage(self):
        rv = Retrieve(self.url)
        # Record the URL as seen before downloading
        self.seen.add(self.url)
        result = rv.downLoad()

        if result[0] == '** invalid url':
            self.log('** download err')
            return

        # Skip link extraction for binary/static resources
        if self.url[-3:].lower() in ['css', '.js', 'jpg', 'png', 'gif', 'bmp', 'mp4', 'exe', 'bin', 'swf', 'ico']:
            return

        if self.url.find('sina') != -1:
            self.log('** sina url')
            return

        try:
            links = rv.getLinks()
        except Exception, e:
            print 'getLinks error:', e
            return

        self.log('success download')
        for link in links:
            self.log('get link', link)
            # Strip query strings and fragments
            link = link.split('?')[0].split('#')[0]
            # Only follow new links inside the target domain (hardcoded to the test host)
            if (link not in self.seen) and (link not in self.vlink) \
                    and (self.domain in link) and link.count('://') < 2 \
                    and link.startswith('http://uctest.ucweb.com:81'):
                self.log('++app', link)
                print 'appendlink: %s' % link
                self.vlink.append(link)
            else:
                self.log('--drop', link)

    def go(self):
        while self.vlink:
            link = self.vlink.pop()
            self.url = link
            print 'download: ', self.url
            self.getPage()

    def log(self, st, link=''):
        f = open('urlall.txt', 'a')
        if len(link) == 0:
            link = self.url
        f.write((u'%s:\t%s\n' % (st, link)).encode("utf-8"))
        f.close()

if __name__ == '__main__':

    url = "http://uctest.ucweb.com:81/wml/index.wml"
    domain = 'uctest.ucweb.com'
    cr = Crawl(url, domain)
    cr.go()
    print 'download over'
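
The code above is Python 2 throughout (print statements, "except Exception, e", and the old urllib/urlparse module layout). For reference, here is a minimal sketch of how the imports and the download step translate to Python 3; the module paths below are the standard-library equivalents, and the target URL is just the test URL from the script:

# Python 3 equivalents of the Python 2 modules used above
import socket
from urllib.request import urlretrieve  # was: from urllib import urlretrieve
from urllib.parse import urlparse       # was: from urlparse import urlparse

socket.setdefaulttimeout(25)
try:
    # urlretrieve keeps the same (url, filename) call signature in Python 3
    filename, headers = urlretrieve('http://uctest.ucweb.com:81/wml/index.wml', 'index.wml')
except Exception as e:                   # Python 2's 'except Exception, e' becomes 'except ... as e'
    print('download error:', e)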

