A Small Python Crawler Project: Scraping a Book Website

#!/usr/bin/python
#coding:utf-8

import json
import urllib2
import re
from bs4 import BeautifulSoup
import MySQLdb

import sys
# Python 2 hack: reset the default string encoding to UTF-8 so that
# implicit str/unicode conversions of Chinese text do not raise errors
reload(sys)
sys.setdefaultencoding('utf8')


def ConnectMysql(book_name, imglist, writer, info, url):
    print book_name, imglist, writer, info, url
    try:
        myconnet = MySQLdb.connect("localhost", "root", "", "db_books", charset="utf8")
    except MySQLdb.OperationalError, message:
        print "Failed to connect to the database:", message
        return
    mycursor = myconnet.cursor()
    # Use a parameterized query so quotes in the scraped text cannot
    # break the statement or inject SQL
    sql = "insert into book_info values(%s,%s,%s,%s,%s)"
    mycursor.execute(sql, (book_name, imglist, writer, info, url))
    myconnet.commit()
    mycursor.close()
    myconnet.close()
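
The script assumes the db_books database and a five-column book_info table already exist. The original post never shows the schema, so here is a minimal one-off setup sketch; the column names are assumptions chosen to match the order of values passed to ConnectMysql:

# Setup sketch only; column names are assumptions matching ConnectMysql
import MySQLdb

conn = MySQLdb.connect("localhost", "root", "", "db_books", charset="utf8")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS book_info (
        book_name VARCHAR(255),
        imglist   VARCHAR(255),
        writer    VARCHAR(255),
        info      TEXT,
        url       VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()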

def OpenPage(page):
    Myheader = {}
    # urllib2.Request takes the target URL first and our request
    # headers second
    request = urllib2.Request(page, headers=Myheader)
    # urlopen sends the request
    f = urllib2.urlopen(request)
    # read the response body from the file-like object f
    data = f.read()
    # The site serves GBK; note that in Python 2, str.decode does not
    # accept keyword arguments, so "ignore" must be passed positionally
    return data.decode("GBK", "ignore").encode("utf-8")
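
Note that Myheader is left empty, so requests go out without a User-Agent, which many sites reject. If that happens, a browser-like header can be filled in; the string below is only an illustration, not taken from the original post:

# Assumed example header; any common browser User-Agent string works
Myheader = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}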

# Parse the listing page and collect the detail-page URLs
def JiexiPage(data):
    soup = BeautifulSoup(data, "html.parser")
    list_ebook = soup.find_all(href=re.compile("thread-"))

    url_list = []
    for item in list_ebook:
        url_list.append("http://www.51dupdf.com/" + item['href'])

    # Deduplicate, since the same thread link appears more than once
    url_list = list(set(url_list))
    return url_list


# Drill into each individual detail page
def EachPageJx(url):
    pagedata = OpenPage(url)
    soup = BeautifulSoup(pagedata, "html.parser")
    # (the original article is truncated at this point, mid-assignment
    # to a regex string named adress)
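
Since the article breaks off mid-function, the field-extraction logic and the driver loop are lost. Below is a minimal sketch of how the pieces could fit together; the selectors, the start URL, and the helper name are all assumptions, not the site's real markup or the author's original code:

# Sketch only: selectors and the start URL are assumptions
def EachPageJxSketch(url):
    pagedata = OpenPage(url)
    soup = BeautifulSoup(pagedata, "html.parser")
    title_tag = soup.find("h1")                            # assumed: title in <h1>
    img_tag = soup.find("img", src=re.compile(r"\.jpg"))   # assumed cover image
    book_name = title_tag.get_text(strip=True) if title_tag else ""
    imglist = img_tag["src"] if img_tag else ""
    writer = ""   # the original parsing for these two fields is lost
    info = ""
    ConnectMysql(book_name, imglist, writer, info, url)

if __name__ == "__main__":
    # Assumed listing URL; the original entry point was not shown
    start_page = "http://www.51dupdf.com/forum.php"
    data = OpenPage(start_page)
    for url in JiexiPage(data):
        EachPageJxSketch(url)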
