Scrapy Installation

Installation

http://www.cnblogs.com/txw1958/archive/2012/07/12/scrapy_installation_introduce.html

http://pan.baidu.com/s/1boxVtAv

Documentation: http://doc.scrapy.org/en/latest/topics/selectors.html

AttributeError: 'module' object has no attribute 'Spider'

This means the installed Scrapy version is too old; upgrade it:

pip install --upgrade scrapy
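To confirm the upgrade took effect, Scrapy has a version subcommand (output shown is illustrative):

scrapy version    # e.g. Scrapy 1.0.3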

On Windows, pywin32 is also required: http://sourceforge.net/projects/pywin32/files/pywin32/Build%20219/


scrapy shell 'url'    # open an interactive shell with `response` ready to use

response.xpath('//ul/li/a/@href').extract()   # every link href under <ul><li>
response.xpath('//ul/li/a/text()').extract()  # the corresponding link texts
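Inside a spider, the two selectors are usually combined so each link's text stays paired with its href; a minimal sketch using the same element names as above:

def parse(self, response):
    for li in response.xpath('//ul/li'):
        href = li.xpath('a/@href').extract()
        text = li.xpath('a/text()').extract()
        if href and text:
            yield {'href': href[0], 'text': text[0]}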


Run:

scrapy crawl dmoz

The CSS-selector equivalent of the extraction, iterated in a parse callback:

for href in response.css("ul.directory.dir-col > li > a::attr('href')"):
    print(href.extract())


Steps:

scrapy startproject myproject           # create a new project
scrapy genspider dmoz_spider dmoz.org   # generate a spider skeleton (shown below)
scrapy crawl dmoz_spider                # run a spider by its name
scrapy list                             # list all spiders in the project
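For reference, genspider writes a module roughly like this (the exact template varies by Scrapy version):

import scrapy

class DmozSpider(scrapy.Spider):
    name = 'dmoz_spider'
    allowed_domains = ['dmoz.org']
    start_urls = ['http://dmoz.org/']

    def parse(self, response):
        pass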

Proxy method 1: a custom downloader middleware (reference: http://www.pythontab.com/html/2014/pythonweb_0326/724.html)

# Import base64 only because we'll need it if the proxy requires authentication
import base64

# Start your middleware class
class ProxyMiddleware(object):
    # Overwrite process_request
    def process_request(self, request, spider):
        # Set the location of the proxy
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines only if your proxy requires authentication
        proxy_user_pass = "USERNAME:PASSWORD"
        # Set up basic authentication for the proxy;
        # b64encode (unlike encodestring) adds no trailing newline to break the header
        encoded_user_pass = base64.b64encode(proxy_user_pass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

# settings.py: register the middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    'pythontab.middlewares.ProxyMiddleware': 100,
}
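The priorities matter: process_request is called in increasing-priority order, so ProxyMiddleware (100) sets request.meta['proxy'] before the built-in HttpProxyMiddleware (110) sees the request.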

Sending cookies with a request:

import scrapy

class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['http://stackoverflow.com/questions?sort=votes']

    # start_requests overrides start_urls, so the request below is what actually runs
    def start_requests(self):
        url = "http://db.bioon.com/list.php?channelid=1016&classid=951"
        # Cookies copied from a logged-in browser session
        cookies = {
            'dz_username': 'wst_today',
            'dz_uid': '1322052',
            'buc_key': 'ofR1I78RBaCHkGp8MdBBRjMx7ustawtY',
            'buc_token': 'a91b8fef55c66846d3975a9fd8883455'
        }
        return [
            scrapy.Request(url, cookies=cookies),
        ]

    def parse(self, response):
        # If this element is present, the logged-in page was returned
        ele = response.xpath(
            '//table[@class="table table-striped"]/thead/tr/th[1]/text()'
        ).extract()
        if ele:
            print("success")



Logging in with a form instead of pasting cookies: parse the login page for the CSRF token and POST the credentials (requires from scrapy.http import FormRequest):

def parse(self, response):
    # Pull the session cookie out of response.headers
    r_headers = response.headers['Set-Cookie']
    cookies_v = r_headers.split(';')[0].split('=')
    cookies = {cookies_v[0]: cookies_v[1]}

    # Request headers that mimic a browser
    headers = {
        'Host': 'login.bioon.com',
        'Referer': 'http://login.bioon.com/login',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
        'X-Requested-With': 'XMLHttpRequest'
    }

    # Get the CSRF token embedded in the login page
    csrf_token = response.xpath(
        '//input[@id="csrf_token"]/@value').extract()[0]

    # Get the POST target URL from the form's action attribute
    login_url = response.xpath(
        '//form[@id="login_form"]/@action').extract()[0]
    end_login = response.urljoin(login_url)

    # Build the POST payload
    formdata = {
        'account': '********',      # use your own registered username
        'client_id': 'usercenter',
        'csrf_token': csrf_token,
        'grant_type': 'grant_type',
        'redirect_uri': 'http://login.bioon.com/userinfo',
        'username': '********',     # use your own registered username
        'password': 'xxxxxxx',      # use your own password
    }

    # Submit the login request
    return FormRequest(
        end_login,
        formdata=formdata,
        headers=headers,
        cookies=cookies,
        callback=self.after_login
    )
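The after_login callback isn't shown above; a minimal hypothetical sketch that just checks whether the logged-in page mentions the user:

def after_login(self, response):
    # hypothetical success check: the username appears in the page body
    if 'wst_today' in response.body:
        print("login ok")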

Proxy method 2: Crawlera (a paid service)

http://www.tuicool.com/articles/7ZnYJb2
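A sketch of the usual setup, assuming the scrapy-crawlera plugin (pip install scrapy-crawlera); the API key below is a placeholder:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your Crawlera API key>'   # placeholder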


scrapyd:

pip install scrapyd

pip install scrapyd-client

If startup fails with an error about a missing mail module (part of Twisted), install Twisted:

sudo apt-get install python-twisted
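A typical deploy-and-schedule round trip, assuming the project and spider names from the steps above and scrapyd on its default port 6800:

# scrapy.cfg in the project root
[deploy:local]
url = http://localhost:6800/
project = myproject

# then, from the project directory:
scrapyd                                  # start the server
scrapyd-deploy local -p myproject        # package and upload the project
curl http://localhost:6800/schedule.json -d project=myproject -d spider=dmoz_spider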

