python爬取豆瓣top前200书籍信息

import requests
import re
import json
import pymysql
import userAgents

from requests.exceptions import ReadTimeout
from requests.exceptions import HTTPError
from requests.exceptions import RequestException
from requests.exceptions import ConnectionError
from multiprocessing import Pool

# Fetch a page.
def get_one_page(url, timeout=10):
    """Download *url* and return its body as text.

    The request is sent with the "Firefox 4.0.1 – Windows" entry of
    ``userAgents.pcUserAgent`` as its headers, so the crawler identifies
    itself as a desktop browser instead of the default requests client.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout requests can block indefinitely, which would also make
            the ``ReadTimeout`` handler below unreachable.

    Returns:
        ``response.text`` when the status code equals ``requests.codes.ok``;
        ``None`` for any other status or when the request itself fails (a
        short message is printed in the failure case).
    """
    headers = userAgents.pcUserAgent.get("Firefox 4.0.1 – Windows")
    try:
        # Only the network call can raise, so it is the only statement
        # kept inside the try block.
        response = requests.get(url, headers=headers, timeout=timeout)
    except ReadTimeout:
        print("ReadTimeout")
        return None
    except HTTPError:
        print("Http error")
        return None
    except ConnectionError:
        print("Connection error")
        return None
    except RequestException:
        # Base class of every requests error: catches whatever is left.
        print("Error")
        return None

    # Only a successful response carries a page worth parsing.
    if response.status_code == requests.codes.ok:
        return response.text
    return None

# Kept so that any code reading this module-level name keeps working; the
# parser below no longer uses it as scratch space.
tempSave = ""

# NOTE(review): the pattern published with this script had lost its HTML tags
# (only five of the six capture groups the code indexes survived), so it could
# never produce usable rows. The pattern below is a reconstruction based on the
# assumed markup of the Douban "Top 250" book list -- confirm it against a live
# page before relying on it.
# Groups: image link, title markup, "author / press / date / price" line,
# score, number of raters, optional one-line introduction.
_BOOK_PATTERN = re.compile(
    r'<tr class="item">.*?<img src="(.*?)"'
    r'.*?<div class="pl2">.*?<a\b[^>]*>(.*?)</a>'
    r'.*?<p class="pl">(.*?)</p>'
    r'.*?<span class="rating_nums">(.*?)</span>'
    r'.*?<span class="pl">\(\s*(.*?)人'
    r'.*?(?:<span class="inq">(.*?)</span>.*?)?</tr>',
    re.S,
)
_TAG_PATTERN = re.compile(r'<[^>]+>')


def _clean_title(raw_title):
    """Return the title text with markup removed and whitespace collapsed."""
    return ' '.join(_TAG_PATTERN.sub('', raw_title).split())


def _split_book_info(info):
    """Split an "author / ... / press / date / price" line into four fields.

    The last three ``/``-separated fields are press, date and price; every
    field before them belongs to the author part (authors and translators).
    Lines with fewer than three fields are padded on the left with empty
    strings so that a malformed entry does not abort the whole page.
    """
    parts = info.split('/')
    if len(parts) < 3:
        parts = [''] * (3 - len(parts)) + parts
    *authors, press, date, money = parts
    return '/'.join(authors), press, date, money


# Parse a page.
def _prase_one_page(html):
    """Yield one dict per book found in *html*.

    This is a generator, so books are produced lazily while the caller
    iterates.

    Args:
        html: Page source as returned by ``get_one_page``; ``None`` or an
            empty string yields nothing.

    Yields:
        A dict with the keys '图片链接', '书名', '作者', '价格', '日期',
        '出版社', '评分', '评价人数' and '简介'; every value is a stripped
        string ('简介' is empty when the book has no introduction line).
    """
    if not html:
        return
    for image, raw_title, info, score, votes, intro in _BOOK_PATTERN.findall(html):
        author, press, date, money = _split_book_info(info)
        yield {
            '图片链接': image.strip(),
            '书名': _clean_title(raw_title),
            '作者': author.strip(),
            '价格': money.strip(),
            '日期': date.strip(),
            '出版社': press.strip(),
            '评分': score.strip(),
            '评价人数': votes.strip(),
            '简介': intro.strip(),
        }


# Store one parsed book in the database.
def write_to_db(content):
    """Insert one book dict (as yielded by ``_prase_one_page``) into MySQL.

    The row goes into ``douban.doubandata``. Values are passed as query
    parameters rather than concatenated into the SQL text, so quotes in
    scraped titles or introductions can neither break nor alter the statement.

    Args:
        content: Dict with the keys '书名', '作者', '价格', '日期', '出版社',
            '评分', '评价人数', '简介' and '图片链接'.

    Raises:
        KeyError: If *content* lacks one of the keys above.
    """
    sql = (
        "INSERT INTO `douban`.`doubandata` (`name`, `author`, `price`, `date`, "
        "`press`, `scode`, `numOfPeople`, `introdution`, `imageLink`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    params = (
        content["书名"],
        content["作者"],
        content["价格"],
        content["日期"],
        content["出版社"],
        content["评分"],
        content["评价人数"],
        content["简介"],
        content["图片链接"],
    )
    try:
        # Keyword arguments: newer PyMySQL releases no longer accept the
        # connection settings positionally.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        db = pymysql.connect(
            host="localhost",
            user="root",
            password="laotu123",
            database="douban",
            charset="utf8",
        )
    except pymysql.MySQLError as exc:
        print("wrong2", exc)
        return
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        db.commit()
    except pymysql.MySQLError as exc:
        # Undo the partial transaction and report why the row was skipped.
        db.rollback()
        print("wrong2", exc)
    finally:
        # Always release the connection, even when the insert failed.
        db.close()


# Fetch, parse and store one result page.
def main(offset):
    """Crawl the list page starting at *offset* and store every book on it.

    Args:
        offset: Value of the ``start`` query parameter, i.e. the index of the
            first book on the requested page.
    """
    url = "https://book.douban.com/top250?start=" + str(offset)
    try:
        html = get_one_page(url)
        for item in _prase_one_page(html):
            write_to_db(item)
    except Exception as exc:
        # Worker boundary: one failing page must not abort the other pages
        # handled by the pool.
        print("wrong2", exc)


if __name__ == "__main__":
    try:
        # A process pool (not threads) crawls the pages in parallel. map()
        # calls main once per offset: 0, 25, 50, 75 and 100.
        with Pool() as pool:
            pool.map(main, [i * 25 for i in range(5)])
    except Exception as exc:
        print("wrong2", exc)

 

userAgents.py

# The userAgent property is a read-only string holding the value of the
# User-Agent header a browser sends with its HTTP requests.

# Desktop browsers: display name -> ready-to-use request headers dict.
# Header values carry no leading whitespace: requests rejects a header value
# that starts with a space as invalid.
pcUserAgent = {
    "safari 5.1 – MAC": {"User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"},
    "safari 5.1 – Windows": {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"},
    "IE 9.0": {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"},
    "IE 8.0": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
    "IE 7.0": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
    "IE 6.0": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    "Firefox 4.0.1 – MAC": {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
    "Firefox 4.0.1 – Windows": {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
    "Opera 11.11 – MAC": {"User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"},
    "Opera 11.11 – Windows": {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"},
    "Chrome 17.0 – MAC": {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"},
    "Maxthon": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},
    "Tencent TT": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)"},
    "The World 2.x": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"},
    "The World 3.x": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)"},
    "sogou 1.x": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"},
    "360": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"},
    "Avant": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)"},
    "Green Browser": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"},
}

# Mobile browsers: display name -> ready-to-use request headers dict.
# Header values carry no leading whitespace: requests rejects a header value
# that starts with a space as invalid. Keys are left exactly as published
# (including the trailing space in "Android Opera ") so existing lookups
# keep working.
mobileUserAgent = {
    "iOS 4.33 – iPhone": {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"},
    "iOS 4.33 – iPod Touch": {"User-Agent": "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"},
    "iOS 4.33 – iPad": {"User-Agent": "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"},
    "Android N1": {"User-Agent": "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"},
    "Android QQ": {"User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"},
    "Android Opera ": {"User-Agent": "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10"},
    "Android Pad Moto Xoom": {"User-Agent": "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13"},
    "BlackBerry": {"User-Agent": "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+"},
    "WebOS HP Touchpad": {"User-Agent": "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0"},
    "Nokia N97": {"User-Agent": "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124"},
    "Windows Phone Mango": {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)"},
    "UC": {"User-Agent": "UCWEB7.0.2.37/28/999"},
    "UC standard": {"User-Agent": "NOKIA5700/ UCWEB7.0.2.37/28/999"},
    "UCOpenwave": {"User-Agent": "Openwave/ UCWEB7.0.2.37/28/999"},
    "UC Opera": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"},
}

 

上面的代码抛出异常写得不正确,大家使用代码时可以更正一下!

你可能感兴趣的:(python)