python3爬虫简单记录-使用CSS选择器

学习过了python写爬虫,怕以后完全不记得了,故简单把写的东西贴一下。
如果以后继续深入,再将这些代码功能补充完全。
使用CSS选择器需要安装cssselect模块和lxml模块,用pip命令安装即可(pip install cssselect lxml)。
还可以使用selenium简单模拟浏览器,或者使用PyQt4/PySide模拟浏览器动作
还有Scrapy爬虫框架。
另外,可能用到图像处理,用PIL模块或者Pillow模块解决简单验证码的问题;用cookielib模块(Python 3中已改名为http.cookiejar)解决cookies问题。

使用的环境:Windows + Python 3.5.2

文件downloading.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

'''downloading.py
Helpers for downloading static web pages, bundled into one module.

Provides a downloader class (Downloader) and a download-throttling class
(Throttle).

Defaults:
DEFAULT_AGENT = 'wswp'  -- user agent
DEFAULT_DELAY = 5       -- delay of 5 s
DEFAULT_RETRIES = 1     -- number of download retries
DEFAULT_TIMEOUT = 60    -- timeout limit

---'''


from urllib import request, error, parse
from datetime import datetime
import random
import time
import socket

# Default value of Downloader's ``user_agent`` (sent as the User-agent header).
DEFAULT_AGENT = 'wswp'
# Default value of Downloader's ``delay``, handed to Throttle (seconds).
DEFAULT_DELAY = 5
# Default value of Downloader's ``num_retries``.
DEFAULT_RETRIES = 1
# Default value of Downloader's ``timeout``, passed to
# socket.setdefaulttimeout() (seconds).
DEFAULT_TIMEOUT = 60


class Downloader(object):
    """下载页面的类
                属性:self.throttle = Throttle(delay)
        self.user_agent = user_agent    --用户代理
        self.proxies = proxies          --下载协议,默认为None
        self.num_retries = num_retries  --重试下载次数
        self.opener = opener            --request启动器,默认为None
        self.cache = cache              --下载缓存,默认关闭(None),需要自己定义cache类
        类方法:download()和特殊类方法__call__()
        如果不需要缓存和限速功能,直接调用download(),不用类实例就不会通过__call__()方法

    """

    def __init__(self, delay=DEFAULT_DELAY, timeout=DEFAULT_TIMEOUT,
                 user_agent=DEFAULT_AGENT, num_retries=DEFAULT_RETRIES,
                 proxies=None, opener=None, cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        """
        类的特殊方法,在对象作为函数被调用时会调用该方法
        传入一个url,使用默认的下载参数,返回下载的html的bytes对象
        该方法,实现了下载前检查缓存和限速5s的功能
        """
        result = None
        if self.cache:  # 检查缓存是否定义
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:  # 如果已经缓存该url,检查之前的下载是否遇到服务端错误
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            # 如果实际没有缓存,则下载该url,然后添加缓存
            # result was not loaded from cache so still need to download
            self.throttle.wait(url)  # 下载延时,默认5s
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(
                url, headers, proxy=proxy, num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, num_retries, proxy, data=None):
        """
        参数:url, headers, proxy, num_retries, data=None
        下载该url,返回HTTP状态码和html组成的字典{'html': html, 'code': code}
        """
        print('Downloading:', url)
        requ = request.Request(url, data, headers or {})
        # request对象启动器,用于支持代理,GET/POST协议和下载页面,同时能返回HTTP状态码
        opener = self.opener or request.build_opener()
        if proxy:
            proxy_params = {parse.urlparse(url).scheme: proxy}
            opener.add_handler(request.ProxyHandler(proxy_params))
        try:
            response = opener.open(requ)
            html = response.read()
            code = response.code
        except error.URLError as e:
            print('Download error:', str(e))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self._get(url,
                                     headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


class Throttle:
    """Throttle downloading by sleeping between requests to the same domain.

    Attributes:
        delay: minimum number of seconds between two downloads from one
            domain; a value <= 0 disables sleeping.
        domains: maps a domain (URL netloc) to the ``datetime`` of its most
            recent access.
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed less than ``delay`` ago.

        The access time of the domain is updated afterwards, whether or not
        a sleep was needed.

        Args:
            url: address about to be downloaded.
        """
        domain = parse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() covers the whole interval; ``.seconds`` is only
            # the sub-day component and would cause a needless sleep when the
            # last access was more than a day ago.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

文件ajax_crawler.py


import lxml.html
import urllib
from downloading import Downloader


def main():
    """POST a GB2312-encoded form to the w3school AJAX demo and print the reply.

    Prints the type of the encoded form data, the raw response bytes, the
    decoded response text and the text of the first ``<p>`` element.
    """
    # Imported here so the function does not depend on another module having
    # loaded the ``urllib.parse`` submodule as a side effect.
    from urllib import parse

    D = Downloader()
    # The form values are real text ('立志金额' is the GB2312 byte sequence
    # C1 A2 D6 BE BD F0 B6 EE). urlencode() percent-encodes them as GB2312;
    # its default is UTF-8, which does not match the GB2312 used to decode
    # the reply below.
    parameters = {'fname': '立志金额', 'lname': '呵呵'}
    data = parse.urlencode(parameters, encoding='gb2312')
    print(type(data))

    # Downloader.download(self, url, headers, num_retries, proxy, data=None)
    # The urlencoded string is pure ASCII, so any ASCII-compatible codec
    # yields the same request body.
    htmldict = D.download(
        'http://www.w3school.com.cn/ajax/demo_post2.asp',
        {'User-agent': ' seagent'}, 1, None, data=data.encode('ascii'))
    html = htmldict['html']
    print(html)
    html = html.decode('gb2312')
    print(html)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('p')[0]

    print(td.text_content())


def main1():
    """Fetch the AJAX demo text through a Downloader instance and print it.

    The response bytes are decoded as GB2312 before printing.
    """
    # Address of the AJAX resource, found by analysing the page.
    ajax_url = 'http://www.w3school.com.cn/tiy/loadtext.asp?f=ajax_async_false'
    fetch = Downloader()
    raw = fetch(ajax_url)
    text = raw.decode('gb2312')
    print(text)



# Run main1() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main1()

你可能感兴趣的:(Python)