Scrapy学习笔记(二)

抓取Mp4ba电影信息

0x00 创建项目和Spider

scrapy startproject movieproject
scrapy genspider -t crawl mp4ba mp4ba.com

大致结构

├── movieproject
│   ├── agents.py
│   ├── __init__.py
│   ├── items.py
│   ├── log.py
│   ├── middlewares.py
│   ├── mysql.py
│   ├── pipelines.py
│   ├── proxy.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mp4ba.py
└── scrapy.cfg

mp4ba.py中的代码

# -*- coding: utf-8 -*-

class Mp4baSpider(CrawlSpider):
    """Crawl mp4ba.com listing pages and movie detail pages.

    Listing pages (``index.php?...page=N``) are handled by ``parse_page``,
    which returns one ``MovieItem`` per table row.  Detail pages
    (``show.php?hash=...``) are handled by ``parse_detail``, which returns a
    single ``MovieDetailItem`` holding the download and magnet links.
    """

    name = "mp4ba"
    allowed_domains = ["mp4ba.com"]
    start_urls = (
        u'http://www.mp4ba.com/',
    )

    rules = (
        # Paginated listing pages: scrape the rows and keep following links.
        # The dots are escaped so they only match a literal '.', and the stray
        # leading backslash of the old pattern (an invalid '\i' escape that
        # newer ``re`` versions reject) is gone.
        Rule(LinkExtractor(allow=r'index\.php\?.*page=\d*'), callback='parse_page', follow=True),

        # Detail pages: scrape the links, do not follow anything from there.
        Rule(LinkExtractor(allow=r'show\.php\?hash=.*'), callback='parse_detail', follow=False),
    )

    @staticmethod
    def _joined(selector, query):
        """Return all strings matched by ``query`` concatenated (u'' if none)."""
        return u''.join(selector.xpath(query).extract())

    @staticmethod
    def _first(selector, query):
        """Return the first string matched by ``query``, or u'' if none."""
        matches = selector.xpath(query).extract()
        return matches[0] if matches else u''

    def parse_page(self, response):
        """Build one ``MovieItem`` for every row of the listing table.

        Returns:
            list: the items scraped from this page (empty when the table has
            no matching rows).
        """
        items = []
        for row in response.xpath("//tbody[@id='data_list']/tr[@class='alt1']"):
            item = MovieItem()
            item['publish_time'] = self._joined(row, "td[1]/text()")
            item['category'] = self._joined(row, "td[2]/a[@href]/text()")
            item['name'] = self._joined(row, "td[3]/a[@href]/text()").strip()
            item['size'] = self._joined(row, "td[4]/text()")
            item['download_count'] = self._joined(row, "td[@nowrap]/span[@class='btl_1']/text()")
            # NOTE(review): this stores the href exactly as written in the page,
            # while parse_detail stores response.url; confirm both use the same
            # form if the two tables are ever joined on detail_link.
            item['detail_link'] = self._joined(row, "td[3]/a[@href]/@href")
            # Key spelling kept as-is; presumably it matches the field declared
            # on MovieItem in items.py.
            item['torrert_count'] = self._joined(row, "td[@nowrap]/span[@class='bts_1']/text()")
            items.append(item)
        return items

    def parse_detail(self, response):
        """Build a ``MovieDetailItem`` from a movie detail page.

        A missing download or magnet anchor yields an empty string instead of
        raising IndexError, so a partially filled page still produces an item.
        """
        item = MovieDetailItem()
        item['detail_link'] = response.url
        item['download_link'] = self._first(
            response, "//p[@class='original download']/a[@id='download']/@href")
        item['magnet_link'] = self._first(
            response, "//p[@class='original magnet']/a[@id='magnet']/@href")
        return item

pipelines.py

class MovieSavePipeline(object):
    """Item pipeline that prints each scraped item and stores it in MySQL.

    ``MovieItem`` instances go to table ``tbMovieItem`` and
    ``MovieDetailItem`` instances go to ``tbMovieDetailItem``; any other item
    type is passed through untouched.
    """

    def __init__(self):
        # Number of items seen so far (all types), used as a running index
        # in the printed output.
        self.linecount = 0
        # Database helper object used for every insert.
        self.db = Mp4BaSql()

    def process_item(self, item, spider):
        """Print and persist ``item``, then hand it to the next pipeline.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        self.linecount += 1

        # Exact type match (not isinstance) is kept on purpose so that the
        # routing of any item subclasses stays what it was.
        # print(...) with a single argument behaves the same as a statement
        # on Python 2 and as a function call on Python 3.
        if type(item) is MovieItem:
            print("%d.\t%s" % (self.linecount, item['name']))
            self.db.insert('tbMovieItem', item)
        elif type(item) is MovieDetailItem:
            print(item['download_link'])
            self.db.insert('tbMovieDetailItem', item)
        return item

mysql.py

#!/usr/bin/python
#coding=utf-8
#

import pymysql
from movieproject.items import MovieItem,MovieDetailItem

class SqlBase(object):
    """Small MySQL helper: one database, one table per item type.

    Subclasses customise behaviour by overriding ``database_Name()``,
    ``config()`` and ``table_Info()``.

    Constructing an instance connects to the server, drops and re-creates the
    database, creates every table described by ``table_Info()`` and then
    disconnects.  ``insert()`` opens a fresh connection for each call and
    always closes it again.

    pymysql errors are printed, not raised.
    """

    def __init__(self):
        self.dbName = self.database_Name()
        self.tbCurTable = None   # name of the table used by the last insert()
        self.conn = None         # open pymysql connection, or None
        self.cur = None          # cursor on self.conn, or None

        self._init()
        self._close()

    # ---- hooks meant to be overridden by subclasses ----------------------

    def database_Name(self):
        """Return the name of the database to (re)create and use."""
        return "dbTest"

    def config(self):
        """Return the connection settings as ``(host, user, password)``.

        NOTE(review): credentials are hard-coded here; consider loading them
        from settings or the environment before sharing this code.
        """
        return ('localhost', 'root', 'Xq123456')

    def table_Info(self):
        """Return ``{table_name: item_instance}`` for the tables to create."""
        table = {
        }
        return table

    # ---- SQL text builders ----------------------------------------------

    def _createdbsql(self, databasename):
        """Return the CREATE DATABASE statement for ``databasename``."""
        return "CREATE database if not exists %s character set utf8;" % databasename

    def _createtablesql(self, tablename, item):
        """Return a CREATE TABLE statement derived from ``item.fields``.

        Every field becomes a ``varchar(80) NOT NULL DEFAULT ''`` column next
        to an auto-increment ``id`` primary key.

        NOTE(review): 80 characters is presumably too short for long values
        such as magnet links; confirm how the server handles longer data.
        """
        columns = "".join(
            "%s varchar(80) NOT NULL DEFAULT ''," % fieldname
            for fieldname in item.fields.keys()
        )
        return ('CREATE TABLE if not exists ' + tablename
                + ' ( id int(10) unsigned NOT NULL AUTO_INCREMENT,'
                + columns + ' PRIMARY KEY (id));')

    def _dropdatabasesql(self, db):
        """Return the DROP DATABASE statement for ``db``."""
        return 'DROP DATABASE if exists ' + db

    def _createinsertSql(self, tablename, data):
        """Return a parameterised INSERT statement for ``data``'s keys.

        The statement uses named ``%(key)s`` placeholders instead of quoted
        literals, so scraped text containing quotes can neither break nor
        inject SQL.  The values must be supplied separately as a mapping to
        ``cursor.execute`` (see ``insert``).
        """
        columns = list(data.keys())
        placeholders = ",".join("%(" + column + ")s" for column in columns)
        return ("INSERT INTO " + tablename
                + " (" + ",".join(columns) + ") VALUES (" + placeholders + ")")

    # ---- connection handling ---------------------------------------------

    def _printerror(self, e):
        """Print a pymysql error without assuming the shape of ``e.args``."""
        args = getattr(e, 'args', ())
        if len(args) >= 2 and isinstance(args[0], int):
            print(u"Mysql Error %d: %s" % (args[0], args[1]))
        else:
            print(u"Mysql Error: %s" % (e,))

    def _connect(self):
        """Open a server connection and cursor using ``config()``."""
        settings = self.config()
        self.conn = pymysql.connect(host=settings[0], user=settings[1],
                                    passwd=settings[2], port=3306, charset="utf8")
        self.cur = self.conn.cursor()

    def _init(self):
        """Re-create the database and its tables from scratch.

        NOTE: this drops the existing database first, so all previously
        stored rows are lost every time an instance is constructed.
        """
        try:
            self._connect()

            # Start from an empty database.
            self.cur.execute(self._dropdatabasesql(self.dbName))
            self.cur.execute(self._createdbsql(self.dbName))
            self.conn.select_db(self.dbName)

            for tablename, item in self.table_Info().items():
                self.cur.execute(self._createtablesql(tablename, item))
        except pymysql.Error as e:
            self._printerror(e)

    def _open(self):
        """Connect and select the database; on failure leave both handles None."""
        try:
            self._connect()
            self.conn.select_db(self.dbName)
        except pymysql.Error as e:
            self._printerror(e)
            self._close()

    def _close(self):
        """Close cursor and connection if present and reset both to None.

        Safe to call when the connection was never established.
        """
        try:
            if self.cur is not None:
                self.cur.close()
            if self.conn is not None:
                self.conn.close()
        finally:
            self.conn = None
            self.cur = None

    # ---- public API --------------------------------------------------------

    def insert(self, tbName, item):
        """Insert ``item`` as one row into table ``tbName``.

        Args:
            tbName: target table name.
            item: mapping of column name to value.

        The connection is closed even when the statement fails.  If the
        connection cannot be opened the error has already been printed by
        ``_open`` and nothing is inserted.
        """
        self.tbCurTable = tbName
        self._open()
        if self.conn is None or self.cur is None:
            return

        try:
            sql = self._createinsertSql(tbName, item)
            # Values are bound by the driver, which takes care of escaping.
            self.cur.execute(sql, dict(item))
            self.conn.commit()
        except pymysql.Error as e:
            self._printerror(e)
        finally:
            self._close()


class Mp4BaSql(SqlBase):
    """SqlBase specialisation that declares the two mp4ba tables."""

    def table_Info(self):
        """Map each table name to an instance of the item type it stores."""
        return {
            'tbMovieItem': MovieItem(),
            'tbMovieDetailItem': MovieDetailItem(),
        }

0x01 问题思考

由于目前 proxy.py 中的很多代理已经不能用了,所以考虑再写一个 Scrapy 爬虫去抓取代理信息。

代码:https://git.oschina.net/xuqi1987/scrapy.git

你可能感兴趣的:(学习笔记)