import requests
import re
import json
import pymysql
import userAgents
from requests.exceptions import ReadTimeout
from requests.exceptions import HTTPError
from requests.exceptions import RequestException
from requests.exceptions import ConnectionError
from multiprocessing import Pool
#提取网页
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout Requests waits indefinitely, which would stall the
            worker and make the ``ReadTimeout`` handler below unreachable.

    Returns:
        The decoded page text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request failure).
    """
    # Present a desktop-browser User-Agent so the site is less likely to
    # reject the request as coming from a crawler.
    headers = userAgents.pcUserAgent.get("Firefox 4.0.1 – Windows")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except ReadTimeout:
        print("ReadTimeout")
    except HTTPError:
        print("Http error")
    except ConnectionError:
        # This is requests.exceptions.ConnectionError (imported at file
        # level), not the builtin of the same name.
        print("Connection error")
    except RequestException:
        print("Error")
    else:
        if response.status_code == requests.codes.ok:
            return response.text
    return None
# Module-level string, presumably meant as the initial value of the title
# buffer used while parsing.
# NOTE(review): `_prase_one_page` contains no `global tempSave` declaration,
# so it never reads or updates this module-level value.
tempSave = ""
#解析网页
def _extract_title(raw_title):
    """Reduce the raw title capture to the text of the book title."""
    title = ""
    for index, char in enumerate(raw_title):
        if char == '\n':
            # Each line break restarts the title as the text preceding it.
            title = raw_title[:index]
        elif char == ':':
            # From a colon onwards the remainder is appended (presumably
            # the subtitle).
            title = title + raw_title[index:]
    if not title:
        # Nothing was collected (no usable line break or colon): the
        # capture itself is the title.
        return raw_title
    # Cut at the first '<' so leftover markup does not end up in the title.
    return title.partition('<')[0]


def _split_publication_info(info):
    """Split an ``authors / press / date / price`` line into four fields."""
    parts = info.split('/')
    # The last three segments are press, date and price. Pad on the left so
    # a line with fewer segments yields empty fields instead of wrapping
    # around through negative indexes ([""] * negative is an empty list).
    parts = [""] * (3 - len(parts)) + parts
    press, date, price = parts[-3:]
    author = "/".join(parts[:-3])
    return author, press, date, price


def _prase_one_page(html, pattern=None):
    """Yield one dict per book found in *html*.

    Args:
        html: Page text to search. Anything that is not a ``str`` (for
            example ``None`` after a failed download) yields nothing.
        pattern: Optional regular expression (string or compiled) with at
            least six capture groups, in this order: image link, raw title,
            publication info, rating, number of raters, introduction.
            String patterns are compiled with ``re.S``. Defaults to the
            pattern shipped with this file.

    Yields:
        Dicts with the keys 图片链接, 书名, 作者, 价格, 日期, 出版社, 评分,
        评价人数 and 简介; every value is a stripped string.
    """
    if pattern is None:
        # NOTE(review): this default has only five capture groups while six
        # fields are read below; its HTML markup appears to have been lost.
        # It is kept unchanged here and must be restored against the real
        # page markup (or a working pattern passed in) before books are
        # produced.
        pattern = '.*?.*?(.*?).*?(.*?).*?(.*?).*?..(.*?)人.*?.*?(.*?).*?'
    regex = re.compile(pattern, re.S) if isinstance(pattern, str) else pattern

    if not isinstance(html, str):
        print("wrong1")
        return

    items = regex.findall(html)
    if items and regex.groups < 6:
        # Too few capture groups to fill every field of a record.
        print("wrong1")
        return

    for item in items:
        author, press, date, money = _split_publication_info(item[2])
        yield {
            '图片链接': item[0].strip(),
            '书名': _extract_title(item[1]).strip(),
            '作者': author.strip(),
            '价格': money.strip(),
            '日期': date.strip(),
            '出版社': press.strip(),
            '评分': item[3].strip(),
            '评价人数': item[4].strip(),
            '简介': item[5].strip(),
        }
# Write the extracted content to the MySQL database
def write_to_db(content):
    """Insert one parsed book record into ``douban.doubandata``.

    Args:
        content: Mapping with the keys 书名, 作者, 价格, 日期, 出版社, 评分,
            评价人数, 简介 and 图片链接.

    The insert is best-effort: a missing key or a connection failure prints
    "wrong2", a failed insert is rolled back, and nothing is raised to the
    caller.
    """
    fields = ("书名", "作者", "价格", "日期", "出版社",
              "评分", "评价人数", "简介", "图片链接")
    try:
        params = tuple(content[field] for field in fields)
    except (KeyError, TypeError):
        print("wrong2")
        return

    # Values come from scraped web pages, so they are passed as query
    # parameters and escaped by the driver instead of being concatenated
    # into the SQL text (quotes in a title would otherwise break or hijack
    # the statement).
    sql = (
        "INSERT INTO `douban`.`doubandata` "
        "(`name`, `author`, `price`, `date`, `press`, `scode`, "
        "`numOfPeople`, `introdution`, `imageLink`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);"
    )

    try:
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        db = pymysql.connect(
            host="localhost",
            user="root",
            password="laotu123",
            database="douban",
            charset='utf8',
        )
    except pymysql.MySQLError:
        print("wrong2")
        return

    try:
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        db.commit()
    except pymysql.MySQLError:
        # Undo the partial insert if anything went wrong.
        db.rollback()
    finally:
        # Always release the connection, including on failure.
        db.close()
#依次获取每一页的内容
def main(offset):
    """Fetch one page of the Douban book Top 250 list and store its books.

    Args:
        offset: Value appended to the URL as the ``start`` query parameter
            (the script entry point passes multiples of 25).

    Errors are reported on stdout and never propagate, so one failing page
    does not abort the other pool workers' pages.
    """
    url = "https://book.douban.com/top250?start=" + str(offset)
    try:
        html = get_one_page(url)
        if html is None:
            # The download failed or returned a non-200 status; there is
            # nothing to parse.
            print("no page content for " + url)
            return
        for item in _prase_one_page(html):
            write_to_db(item)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop the worker.
        print("wrong2")
if __name__ == "__main__":
    # Speed up the crawl with a pool of worker processes (not threads).
    # map() calls main() once for every offset in the list: 0, 25, ..., 100.
    # The sequential equivalent would be: for i in range(5): main(i * 25)
    try:
        # The context manager shuts the pool down once map() has returned,
        # so no worker processes are left behind.
        with Pool() as pool:
            pool.map(main, [i * 25 for i in range(5)])
    except Exception:
        print("wrong2")
#userAgent 属性是一个只读的字符串,声明了浏览器用于 HTTP 请求的用户代理头的值。
# Ready-made request headers for desktop browsers, keyed by a descriptive
# browser name. Each value is a dict that can be passed directly as the
# ``headers`` argument of a request.
# Header values must not start with whitespace (Requests rejects such values
# as invalid), so every User-Agent string begins at its first token.
pcUserAgent = {
    "safari 5.1 – MAC": {"User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"},
    "safari 5.1 – Windows": {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"},
    "IE 9.0": {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"},
    "IE 8.0": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
    "IE 7.0": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
    "IE 6.0": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    "Firefox 4.0.1 – MAC": {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
    "Firefox 4.0.1 – Windows": {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
    "Opera 11.11 – MAC": {"User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"},
    "Opera 11.11 – Windows": {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"},
    "Chrome 17.0 – MAC": {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"},
    "Maxthon": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},
    "Tencent TT": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)"},
    "The World 2.x": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"},
    "The World 3.x": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)"},
    "sogou 1.x": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"},
    "360": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"},
    "Avant": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)"},
    "Green Browser": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"},
}
# Ready-made request headers for mobile browsers, keyed by a descriptive
# device/browser name. Each value is a dict that can be passed directly as
# the ``headers`` argument of a request.
# Header values must not start with whitespace (Requests rejects such values
# as invalid), so every User-Agent string begins at its first token.
mobileUserAgent = {
    "iOS 4.33 – iPhone": {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"},
    "iOS 4.33 – iPod Touch": {"User-Agent": "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"},
    "iOS 4.33 – iPad": {"User-Agent": "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"},
    "Android N1": {"User-Agent": "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"},
    "Android QQ": {"User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"},
    # The trailing space in this key is part of the original lookup name.
    "Android Opera ": {"User-Agent": "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10"},
    "Android Pad Moto Xoom": {"User-Agent": "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13"},
    "BlackBerry": {"User-Agent": "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+"},
    "WebOS HP Touchpad": {"User-Agent": "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0"},
    "Nokia N97": {"User-Agent": "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124"},
    "Windows Phone Mango": {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)"},
    "UC": {"User-Agent": "UCWEB7.0.2.37/28/999"},
    "UC standard": {"User-Agent": "NOKIA5700/ UCWEB7.0.2.37/28/999"},
    "UCOpenwave": {"User-Agent": "Openwave/ UCWEB7.0.2.37/28/999"},
    "UC Opera": {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"},
}
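# Note from the author: the exception handling in the code above is not
# written correctly; please correct it when you use this code.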