环境win10,python
只爬取了书的名字,留个纪念。
# -*- coding: utf-8 -*-
import requests
import re
import sys
import random
# Python 2 idiom: reloading sys makes sys.setdefaultencoding reachable again
# (the site module removes it at startup), after which UTF-8 is set as the
# default codec for implicit str/unicode conversions.
# NOTE(review): reload() is not a builtin and sys.setdefaultencoding does not
# exist on Python 3 -- confirm this script is only ever run under Python 2.
reload(sys)
sys.setdefaultencoding("utf-8")
# Browser User-Agent strings; randHeader() draws one per call. Kept at module
# level so the collection is built once instead of on every request.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
)


def randHeader():
    """Build HTTP request headers with a randomly chosen User-Agent.

    ``Connection``, ``Accept`` and ``Accept-Language`` always carry the same
    values; only ``User-Agent`` differs between calls.

    Returns:
        dict: Header names mapped to header values.
    """
    return {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': random.choice(_USER_AGENTS),
    }
class spider(object):
    """Small crawler that pulls book titles out of paginated listing pages."""

    def __init__(self):
        print('start...')

    def getSource(self, url):
        """Download ``url`` and return the response body as text.

        Each request is sent with a header set from ``randHeader()`` and
        ``timeout=20``.
        """
        response = requests.get(url, headers=randHeader(), timeout=20)
        return response.text

    def changePage(self, url, totalPage):
        """Build the listing URLs for pages 2 through ``totalPage``.

        Two page markers in an Amazon search URL are rewritten: the
        ``pg_<n>?rh`` part of the path and the ``page=<n>&`` query parameter,
        as found in
        https://www.amazon.com/s/ref=sr_pg_8?rh=n%3A283155%2Cn%3A%211000%2Cn%3A17%2Cn%3A10132&page=8&ie=UTF8&qid=1507704586

        Args:
            url: Template URL containing the page markers.
            totalPage: Number of the last page to generate.

        Returns:
            list: One URL per page in ascending page order; empty when
            ``totalPage`` is less than 2.
        """
        # An earlier Dangdang variant rewrote '/pg<n>-cp' instead, for URLs
        # such as http://category.dangdang.com/pg1-cp01.45.98.00.00.00.html
        # NOTE(review): numbering starts at 2, so a URL for page 1 is never
        # produced -- confirm whether that is intended.
        page_group = []
        for page in range(2, totalPage + 1):
            # flags has to be passed by keyword: the fourth positional
            # parameter of re.sub is ``count``, so a positional re.S would be
            # read as "replace at most 16 matches" and never act as a flag.
            newPage = re.sub(r'pg_(.*?)\?rh', 'pg_%s?rh' % page, url, flags=re.S)
            newPage = re.sub(r'page=(.*?)&', 'page=%s&' % page, newPage, flags=re.S)
            page_group.append(newPage)
        return page_group

    def getInfo(self, eachBook):
        """Extract book titles from a chunk of page markup.

        Titles are taken from the ``data-attribute`` value of ``h2`` tags, for
        example ``h2 data-attribute="Robicheaux: A Novel (Dave Robicheaux)"``.

        Args:
            eachBook: Markup to search.

        Returns:
            list: Matched titles in order of appearance; empty if none match.
        """
        return re.findall('h2 data-attribute="(.*?)"', eachBook, re.S)

    def saveInfo(self, bookInfo):
        """Append every entry of ``bookInfo`` to ``infoBook_amazon.csv``, one per line.

        Args:
            bookInfo: Iterable of title strings.
        """
        # The context manager closes the file even if a write raises.
        with open('infoBook_amazon.csv', 'a') as f:
            for each in bookInfo:
                f.write(each + '\n')
if __name__ == '__main__':
    # Script entry point: build the page URLs from one template listing URL,
    # then fetch each page, extract its titles, print them and append them to
    # the output file.
    #
    # Listing URLs left over from earlier runs (Dangdang first, then Amazon):
    #url = 'http://category.dangdang.com/pg2-cp01.58.00.00.00.00.html'
    #url = 'http://category.dangdang.com/pg1-cp01.45.62.00.00.00.html'
    #url = 'http://category.dangdang.com/pg1-cp01.45.98.00.00.00.html'
    #url = 'https://www.amazon.com/s/ref=sr_pg_8?rh=n%3A283155%2Cn%3A%211000%2Cn%3A17%2Cn%3A10132&page=8&ie=UTF8&qid=1507704586'
    #url = 'https://www.amazon.com/s/ref=lp_10399_pg_3?rh=n%3A283155%2Cn%3A%211000%2Cn%3A17%2Cn%3A10399&page=3&ie=UTF8&qid=1507711543'
    #url = 'https://www.amazon.com/s/ref=lp_10129_pg_3?rh=n%3A283155%2Cn%3A%211000%2Cn%3A17%2Cn%3A10129&page=3&ie=UTF8&qid=1507712363'
    #url = 'https://www.amazon.com/s/ref=lp_10177_pg_3?rh=n%3A283155%2Cn%3A%211000%2Cn%3A17%2Cn%3A10134%2Cn%3A10177&page=3&ie=UTF8&qid=1507713351'
    #url = 'https://www.amazon.com/s/ref=lp_9059878011_pg_3?rh=n%3A283155%2Cn%3A%211000%2Cn%3A17%2Cn%3A9822%2Cn%3A9823%2Cn%3A9059878011&page=3&ie=UTF8&qid=1507714014'
    #url = 'https://www.amazon.com/s/ref=lp_10367653011_pg_3?rh=n%3A283155%2Cn%3A%211000%2Cn%3A28%2Cn%3A10367653011&page=3&ie=UTF8&qid=1507719482'
    #url = 'https://www.amazon.com/s/ref=lp_3344092011_pg_2?rh=n%3A283155%2Cn%3A%211000%2Cn%3A28%2Cn%3A3344092011&page=2&ie=UTF8&qid=1507720009'
    url = 'https://www.amazon.com/s/ref=lp_17437_pg_3?rh=n%3A283155%2Cn%3A%211000%2Cn%3A28%2Cn%3A17437&page=3&ie=UTF8&qid=1507720596'
    #print(requests.get(url).text.encode('utf8'))
    dangdangSpider = spider()
    allLinks = dangdangSpider.changePage(url, 100)
    for link in allLinks:
        print(u'processing...' + link)
        # The page text is encoded to UTF-8 before the title regex runs.
        # NOTE(review): on Python 3 this hands bytes to a str pattern, which
        # re rejects with TypeError -- confirm the script is Python 2 only.
        html = dangdangSpider.getSource(link).encode('utf8')
        info = dangdangSpider.getInfo(html)
        print(info)
        dangdangSpider.saveInfo(info)