Python爬虫之Pyspider框架实战

pyspider是个蛮简洁的框架,爬取内容直接存放在resultdb里,可以web查看,超级方便实用。

废话少说,开始实战吧

本项目目的:

使用pyspider爬取顶点小说网的小说,并存入本地mysql数据库

思路:

代码逻辑很简单,先爬取小说分类的url,沿着分类爬取各类目下的小说名,然后再爬取各章节,最后获取到每章节内容,把需要的各个信息存入数据库

步骤:

1,在命令行执行 pyspider all 启动pyspider

2,新建一个项目

3,输入代码Handler

这里重点用到了self.crawl的save参数在各个回调之间传递数据(在回调里通过response.save读取),以及覆盖了on_result以便把结果存储到本地数据库

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on 2017-07-20 16:56:22

# Project: dingdian

from pyspider.libs.base_handler import *

import re

from bs4 import BeautifulSoup

from pyspider.result import ResultWorker

from pyspider.database.mysql.mysqldb import SQL

class Handler(BaseHandler):
    """Crawl novels from x23us.com and store every chapter in MySQL.

    Crawl flow: category pages -> paginated book lists -> chapter list of
    each book -> chapter content.  Book-level fields are handed from one
    callback to the next through the ``save`` argument of ``self.crawl``
    and read back from ``response.save``.
    """

    crawl_config = {
    }

    @staticmethod
    def _page_number(text, default):
        """Return ``text`` parsed as an int, or ``default`` if it is not numeric."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return default

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the first list page of categories 1 to 10."""
        baseurl = 'http://www.x23us.com/class/'
        sufix = '_1.html'
        for i in range(1, 11):
            url = baseurl + str(i) + sufix
            self.crawl(url, callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every list page of the category that ``response`` belongs to.

        The page range is read from the ``.first`` and ``.last`` pager
        elements; if either is not numeric only a single page is queued.
        """
        first = self._page_number(response.doc('.first').text(), 1)
        total_page_num = self._page_number(response.doc('.last').text(), first)

        # Derive ".../class/<category>_" from the URL that was fetched so
        # that each category paginates through its own pages instead of
        # always walking category 1.
        baseurl = response.url.rsplit('_', 1)[0] + '_'
        sufix = '.html'
        for index in range(first, total_page_num + 1):
            url = baseurl + str(index) + sufix
            self.crawl(url, callback=self.list_books, validate_cert=False)

    def list_books(self, response):
        """Queue the chapter-list page of every book row on a list page."""
        for item in response.doc('tr').items():
            book_link = item.find('.L').find('a').eq(1)
            booktitle = book_link.text()
            bookurl = book_link.attr('href')
            # Rows without a title or link (e.g. the table header) are skipped.
            if not booktitle or not bookurl:
                continue

            cells = item.find('.C')
            savedata = {
                'booktitle': booktitle,
                'author': cells.eq(0).text(),
                'updatetime': cells.eq(1).text(),
                'status': cells.eq(2).text(),
            }
            self.crawl(bookurl, callback=self.list_chapter,
                       save=savedata, validate_cert=False)

    def list_chapter(self, response):
        """Queue every chapter page of a book, forwarding the book fields."""
        book = response.save
        for item in response.doc('.L').items():
            link = item.find('a')
            chapterurl = link.attr('href')
            # Cells without a link have nothing to crawl.
            if not chapterurl:
                continue

            savedata = {
                'booktitle': book['booktitle'],
                'author': book['author'],
                'updatetime': book['updatetime'],
                'status': book['status'],
                'chaptertitle': link.text(),
            }
            self.crawl(chapterurl, callback=self.list_content,
                       save=savedata, validate_cert=False)

    @config(priority=2)
    def list_content(self, response):
        """Return one chapter as a dict whose keys are the table columns.

        ``category`` is the text of the second ``dt > a`` link and
        ``content`` the text of ``#contents``; both fall back to an empty
        string when the element is missing.
        """
        navlist = [item.text() for item in response.doc('dt > a').items()]
        category = navlist[1] if len(navlist) > 1 else ''

        content = ''
        # If several "#contents" elements exist, the last one wins.
        for item in response.doc('#contents').items():
            content = item.text()

        book = response.save
        return {
            "booktitle": book['booktitle'],
            "author": book['author'],
            "updatetime": book['updatetime'],
            "status": book['status'],
            "category": category,
            "chaptertitle": book['chaptertitle'],
            "content": content,
        }

    def on_result(self, result):
        """Write a chapter result into the ``novel`` table of the local MySQL.

        Empty results and results without a ``booktitle`` are ignored.
        """
        if not result or not result.get('booktitle'):
            return
        sql = SQL()
        sql.replace('novel', **result)

其他的代码都很简单,重点说下存入本地数据库,

首先需要在C:\Python3.5\Lib\site-packages\pyspider\database\mysql目录下新建一个mysqldb.py模块,然后输入:

from six import itervalues

# import mysql.connector

import pymysql

from datetime import date, datetime, timedelta

class SQL:
    """Minimal MySQL helper that writes rows with ``REPLACE INTO``.

    Connection settings are class attributes.  The open connection is kept
    on the class as ``SQL.connection``; the empty string means "not
    connected".
    """

    username = 'root'
    password = ''
    database = 'dingdian'
    host = 'localhost'
    connection = ''
    charset = 'utf8'
    # Whether __init__ opens a connection.  Named ``auto_connect`` so that it
    # does not collide with the ``connect`` method.
    auto_connect = True
    placeholder = '%s'

    def __init__(self):
        """Open a connection right away unless ``auto_connect`` is false."""
        if self.auto_connect:
            self.connect()

    def escape(self, string):
        """Return ``string`` quoted as a MySQL identifier.

        Backticks inside the name are doubled so it cannot end the quoting.
        """
        return '`%s`' % ('%s' % string).replace('`', '``')

    def connect(self):
        """Connect with the class-level settings and store the connection.

        Returns:
            True on success, False if the connection attempt failed (the
            error is printed and ``SQL.connection`` is left unchanged).
        """
        config = {
            'user': SQL.username,
            'password': SQL.password,
            'host': SQL.host,
            'charset': SQL.charset,
        }
        if SQL.database is not None:
            config['database'] = SQL.database

        try:
            cnx = pymysql.connect(**config)
        except Exception as err:
            print('Something went wrong', err)
            return False

        SQL.connection = cnx
        return True

    def replace(self, tablename=None, **values):
        """Execute ``REPLACE INTO tablename`` with the given column values.

        Args:
            tablename: name of the target table.
            **values: column name -> value; values are passed as query
                parameters, not interpolated into the SQL text.

        Returns:
            True if the statement was executed and committed, False if there
            is no connection or the statement failed (the error is printed).
        """
        if SQL.connection == '':
            print('Please connect first')
            return False

        tablename = self.escape(tablename)
        if values:
            # Fix the column order once so names and parameters line up.
            columns = list(values)
            _keys = ",".join(self.escape(k) for k in columns)
            _values = ",".join([self.placeholder] * len(columns))
            sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
            params = [values[k] for k in columns]
        else:
            # MySQL has no "DEFAULT VALUES" clause; an empty column list and
            # an empty VALUES list create a row of defaults.
            sql_query = "REPLACE INTO %s () VALUES ()" % tablename
            params = None

        cur = SQL.connection.cursor()
        try:
            if params is None:
                cur.execute(sql_query)
            else:
                cur.execute(sql_query, params)
            SQL.connection.commit()
            return True
        except Exception as err:
            print("An error occured :{}".format(err))
            return False
        finally:
            # Always release the cursor, on success and on failure.
            cur.close()

再通过wamp中的phpmyadmin来新建一个dingdian数据库和novel表,

表的字段包括id,booktitle,chaptertitle,category,author,status,content,updatetime(字段名必须与list_content返回结果中的键一致)

这样你就实现了顶点小说网的小说爬取了。具体代码见:

https://github.com/chenxiang2017/spidersamples/tree/master/dingdian/dingdianpyspider

注意,我这里连接mysql用的是pymysql,如果没装,需要pip install pymysql安装下。

你可能感兴趣的:(Python爬虫之Pyspider框架实战)