scrapy startproject movieproject
scrapy genspider -t crawl mp4ba mp4ba.com
大致结构
├── movieproject
│   ├── agents.py
│   ├── __init__.py
│   ├── items.py
│   ├── log.py
│   ├── middlewares.py
│   ├── mysql.py
│   ├── pipelines.py
│   ├── proxy.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mp4ba.py
└── scrapy.cfg
mp4ba.py中的代码
# -*- coding: utf-8 -*-
class Mp4baSpider(CrawlSpider):
    """Crawl mp4ba.com listing pages and the detail page of every movie.

    NOTE: ``CrawlSpider``, ``Rule``, ``LinkExtractor``, ``MovieItem`` and
    ``MovieDetailItem`` are used below but their imports are not part of
    this excerpt; the enclosing module has to provide them.
    """

    name = "mp4ba"
    allowed_domains = ["mp4ba.com"]
    start_urls = (
        u'http://www.mp4ba.com/',
    )
    rules = (
        # Paginated listing pages: scrape the rows and keep following links.
        # The pattern must not start with "\i": that is an unknown regex
        # escape and is rejected by newer versions of the ``re`` module.
        Rule(LinkExtractor(allow=r'index\.php\?.*page=\d*'),
             callback='parse_page', follow=True),
        # Detail pages: scrape the links, do not follow anything further.
        Rule(LinkExtractor(allow=r'show\.php\?hash=.*'),
             callback='parse_detail', follow=False),
    )

    @staticmethod
    def _first_or_empty(values):
        """Return the first element of *values*, or u'' when it is empty."""
        return values[0] if values else u''

    def parse_page(self, response):
        """Build one ``MovieItem`` per row of the listing table.

        :param response: response of a listing page.
        :returns: list of ``MovieItem``; every field is the concatenation of
            the matched text nodes (an empty string when nothing matched).
        """
        items = []
        for data in response.xpath("//tbody[@id='data_list']/tr[@class='alt1']"):

            def text(xpath, row=data):
                # Join all matches so that a missing cell yields u''.
                return u''.join(row.xpath(xpath).extract())

            item = MovieItem()
            item['publish_time'] = text("td[1]/text()")
            item['category'] = text("td[2]/a[@href]/text()")
            item['name'] = text("td[3]/a[@href]/text()").strip()
            item['size'] = text("td[4]/text()")
            item['download_count'] = text("td[@nowrap]/span[@class='btl_1']/text()")
            item['detail_link'] = text("td[3]/a[@href]/@href")
            # The key spelling ("torrert") is kept as-is: it has to match the
            # field declared on MovieItem, which is defined outside this excerpt.
            item['torrert_count'] = text("td[@nowrap]/span[@class='bts_1']/text()")
            items.append(item)
        return items

    def parse_detail(self, response):
        """Build a ``MovieDetailItem`` from a movie detail page.

        :param response: response of a ``show.php?hash=...`` page.
        :returns: the populated ``MovieDetailItem``.  A link that is not
            present on the page is stored as an empty string instead of
            raising ``IndexError``, matching how ``parse_page`` treats
            missing cells.
        """
        item = MovieDetailItem()
        item['detail_link'] = response.url
        item['download_link'] = self._first_or_empty(
            response.xpath("//p[@class='original download']/a[@id='download']/@href").extract())
        item['magnet_link'] = self._first_or_empty(
            response.xpath("//p[@class='original magnet']/a[@id='magnet']/@href").extract())
        return item
pipelines.py
class MovieSavePipeline(object):
    """Item pipeline that prints a progress line and stores each item in MySQL.

    ``MovieItem`` instances go to table ``tbMovieItem`` and
    ``MovieDetailItem`` instances to ``tbMovieDetailItem``; any other item
    is passed through untouched.
    """

    def __init__(self):
        # Number of items seen so far (of any type).
        self.linecount = 0
        # Database helper used to persist the items.
        self.db = Mp4BaSql()

    def process_item(self, item, spider):
        """Persist *item* according to its type and hand it on.

        :param item: the scraped item.
        :param spider: the spider that produced the item (unused).
        :returns: *item*, unchanged, so that later pipelines still receive it.
        """
        self.linecount += 1
        # Exact type match is kept on purpose: the item classes are defined
        # elsewhere, and this routing stays unambiguous even if one of them
        # derives from the other.
        item_type = type(item)
        if item_type is MovieItem:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%d.\t%s" % (self.linecount, item['name']))
            self.db.insert('tbMovieItem', item)
        elif item_type is MovieDetailItem:
            print(item['download_link'])
            self.db.insert('tbMovieDetailItem', item)
        return item
mysql.py
#!/usr/bin/python
#coding=utf-8
#
import pymysql
from movieproject.items import MovieItem,MovieDetailItem
class SqlBase(object):
    """Small MySQL helper whose tables are derived from item objects.

    Subclasses customise it by overriding :meth:`database_Name`,
    :meth:`config` and :meth:`table_Info`.

    WARNING: constructing an instance DROPS the database returned by
    :meth:`database_Name`, recreates it, and creates one table per entry of
    :meth:`table_Info`.  Every :meth:`insert` opens and closes its own
    connection.
    """

    def __init__(self):
        self.dbName = self.database_Name()
        # Name of the table targeted by the most recent insert().
        self.tbCurTable = None
        self.conn = None
        self.cur = None
        self._init()
        self._close()

    # ------------------------------------------------------------------
    # Hooks meant to be overridden by subclasses
    # ------------------------------------------------------------------
    def database_Name(self):
        """Return the name of the database to (re)create and use."""
        return "dbTest"

    def config(self):
        """Return the connection settings as ``(host, user, password)``.

        NOTE(review): the credentials are hard-coded; override this method
        rather than committing real passwords.
        """
        return ('localhost', 'root', 'Xq123456')

    def table_Info(self):
        """Return a mapping ``table name -> item`` describing the tables.

        Each item must expose a ``fields`` mapping; its keys become the
        column names.  The base class declares no tables.
        """
        return {}

    # ------------------------------------------------------------------
    # SQL text builders
    # ------------------------------------------------------------------
    def _createdbsql(self, databasename):
        """Return the ``CREATE DATABASE`` statement for *databasename*."""
        return "CREATE database if not exists %s character set utf8;" % databasename

    def _createtablesql(self, tablename, item):
        """Return the ``CREATE TABLE`` statement for *tablename*.

        The table gets an auto-increment ``id`` primary key plus one
        ``varchar(80)`` column per key of ``item.fields``.

        NOTE(review): 80 characters may be too short for long values such
        as magnet links -- confirm against the data being stored.
        """
        columns = "".join(
            "%s varchar(80) NOT NULL DEFAULT ''," % field
            for field in item.fields.keys()
        )
        return ('CREATE TABLE if not exists ' + tablename
                + ' ( id int(10) unsigned NOT NULL AUTO_INCREMENT,'
                + columns + ' PRIMARY KEY (id));')

    def _dropdatabasesql(self, db):
        """Return the ``DROP DATABASE`` statement for *db*."""
        return 'DROP DATABASE if exists ' + db

    def _createinsertSql(self, tablename, data):
        """Return a parameterized ``INSERT`` statement for *data*.

        Only the column names (the keys of *data*) are embedded in the SQL
        text; every value is represented by a ``%s`` placeholder and must be
        passed separately to ``cursor.execute``.  This keeps values that
        contain quotes from breaking, or injecting into, the statement.
        """
        columns = list(data.keys())
        placeholders = ",".join(["%s"] * len(columns))
        return ("INSERT INTO " + tablename
                + " (" + ",".join(columns) + ") VALUES (" + placeholders + ")")

    # ------------------------------------------------------------------
    # Connection handling
    # ------------------------------------------------------------------
    def _report_error(self, error):
        """Print *error*; tolerate exceptions that carry fewer than two args."""
        args = error.args
        if len(args) >= 2:
            print(u"Mysql Error %d: %s" % (args[0], args[1]))
        else:
            print(u"Mysql Error: %s" % (error,))

    def _connect(self):
        """Open a server connection and a cursor (no database selected yet)."""
        host, user, password = self.config()
        self.conn = pymysql.connect(host=host, user=user, passwd=password,
                                    port=3306, charset="utf8")
        self.cur = self.conn.cursor()

    def _init(self):
        """Drop and recreate the database, then create all declared tables.

        Database errors are reported and swallowed (best effort), as before.
        """
        try:
            self._connect()
            # Start from a clean database on every run.
            self.cur.execute(self._dropdatabasesql(self.dbName))
            self.cur.execute(self._createdbsql(self.dbName))
            self.conn.select_db(self.dbName)
            for table_name, item in self.table_Info().items():
                self.cur.execute(self._createtablesql(table_name, item))
        except pymysql.Error as e:
            self._report_error(e)

    def _open(self):
        """Connect and select the database; on failure report and clean up.

        After a failure ``self.conn`` and ``self.cur`` are ``None``.
        """
        try:
            self._connect()
            self.conn.select_db(self.dbName)
        except pymysql.Error as e:
            self._report_error(e)
            self._close()

    def _close(self):
        """Close cursor and connection if they exist, then forget them.

        Safe to call when nothing is open (e.g. after a failed connect).
        """
        try:
            if self.cur is not None:
                self.cur.close()
        finally:
            try:
                if self.conn is not None:
                    self.conn.close()
            finally:
                self.conn = None
                self.cur = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def insert(self, tbName, item):
        """Insert *item* as one row of table *tbName* and commit.

        :param tbName: target table name.
        :param item: mapping of column name -> value.

        Database errors are reported and swallowed; the connection is
        always closed afterwards, including when the statement fails.
        """
        self.tbCurTable = tbName
        self._open()
        if self.conn is None or self.cur is None:
            # _open() already reported why the connection is unavailable.
            return
        try:
            sql = self._createinsertSql(tbName, item)
            # keys() and values() of an unmodified mapping share one order,
            # so the values line up with the columns named in the statement.
            self.cur.execute(sql, list(item.values()))
            self.conn.commit()
        except pymysql.Error as e:
            self._report_error(e)
        finally:
            self._close()
class Mp4BaSql(SqlBase):
    """Database helper for the mp4ba crawl: declares its two tables."""

    def table_Info(self):
        """Return the table-name -> item mapping used to create the tables.

        ``tbMovieItem`` is described by a fresh ``MovieItem`` and
        ``tbMovieDetailItem`` by a fresh ``MovieDetailItem``.
        """
        return dict(
            tbMovieItem=MovieItem(),
            tbMovieDetailItem=MovieDetailItem(),
        )
由于目前代码里 proxy.py 中的很多代理已经失效,所以考虑再写一个 Scrapy 爬虫来抓取代理信息。
代码:https://git.oschina.net/xuqi1987/scrapy.git