此爬虫基于BeautifulSoup4,模仿scrapy的结构进行编码。
除爬虫主体外,代码里额外增加了两个py文件。
第一个是resource,里面包含请求头和代理ip,其中的代理ip可以从西刺代理网站爬取。
# Pool of browser User-Agent strings; the crawler picks one at random for
# each request (see getRandomHeaders).
UserAgents = [
    'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
    'Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19',
    'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
    'Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3',
    # The original entry was cut off after "Trident/5.0;" (no closing paren).
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    # The original entry had "rv,2.0.1"; the revision token is "rv:<version>".
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
]
# Pool of "host:port" HTTP proxy addresses; the crawler picks one at random
# for each request (see getRandomProxy).
# NOTE: every entry needs a trailing comma. Without commas Python joins
# adjacent string literals, which collapsed the whole list into one single
# meaningless string.
PROXIES = [
    '111.155.116.210:8123',
    '61.135.217.7:80',
    '122.114.31.177:808',
    '27.15.22.242:8118',
    '121.31.195.245:8123',
    '218.20.218.59:8118',
    '58.216.202.149:8118',
    '180.113.168.23:8118',
    '58.249.99.188:8118',
    '111.192.179.38:8118',
    '111.155.116.200:8123',
    '221.195.11.152:80',
    '180.118.242.95:61234',
    '122.7.178.49:8118',
    '60.179.40.157:33404',
    '27.159.167.96:49330',
    '60.23.38.52:80',
    '111.155.116.208:8123',
    '180.172.159.5:8118',
    '111.155.116.217:8123',
    '121.31.102.230:8123',
    '171.39.45.223:8123',
    '221.224.62.182:3128',
    '222.137.200.128:8118',
    '110.73.5.206:8123',
    '60.168.87.15:808',
    '182.35.144.113:8118',
    '125.109.197.48:23643',
    '182.34.48.112:41480',
    '106.58.123.223:80',
    '113.121.241.218:808',
    '183.165.77.18:8118',
    '180.158.109.60:8118',
    '114.231.153.193:20013',
    '115.217.253.61:808',
    '58.48.88.140:8118',
    '180.118.241.139:61234',
    '171.39.28.93:8123',
    '180.119.65.169:808',
    '111.155.116.220:8123',
    '110.189.207.77:29977',
    '42.225.138.157:8118',
    '115.151.205.150:808',
    '111.155.116.211:8123',
]
第二个是mylog.py,主要用于写日志文件;我们可以通过查看日志来判断代码在哪一部分出现了问题。
#-*- coding:utf-8 -*-
import logging
import getpass
import sys
class MyLog(object):
    """Small wrapper around ``logging`` that logs to a file and to a stream.

    The underlying logger is named after the current OS user. Records of
    level DEBUG and above go to ``<script>.log`` (derived from
    ``sys.argv[0]``) and to a ``logging.StreamHandler``.
    """

    # Handler pair already attached to each logger name.
    # ``logging.getLogger`` hands back the same logger object for the same
    # name, so a second MyLog instance must not attach a second pair of
    # handlers, otherwise every record would be emitted twice.
    _handlers = {}

    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Only strip the extension when it really is ".py"; blindly dropping
        # the last three characters mangles script names without a suffix.
        script = sys.argv[0]
        if script.endswith('.py'):
            script = script[:-3]
        self.logFile = script + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name) -10s %(message)-12s\r\n')
        if self.user not in MyLog._handlers:
            fileHandler = logging.FileHandler(self.logFile, encoding='utf8')
            fileHandler.setFormatter(self.formatter)
            fileHandler.setLevel(logging.DEBUG)
            streamHandler = logging.StreamHandler()
            streamHandler.setFormatter(self.formatter)
            streamHandler.setLevel(logging.DEBUG)
            self.logger.addHandler(fileHandler)
            self.logger.addHandler(streamHandler)
            MyLog._handlers[self.user] = (fileHandler, streamHandler)
        self.logHand, self.logHandSt = MyLog._handlers[self.user]

    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn is a deprecated alias that newer Python versions
        # removed; Logger.warning is the supported spelling.
        self.logger.warning(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)
if __name__ == '__main__':
    # Manual smoke test: emit one record per level, including non-ASCII text.
    mylog = MyLog()
    mylog.debug(u"I'm debug 测试中文")
    mylog.info("I'm info")
    # The warn-level sample used to say "I'm info" (copy-paste slip), which
    # made the demo output misleading.
    mylog.warn("I'm warn")
    mylog.error(u"I'm error 测试中文")
    mylog.critical("I'm critical")
下面是爬虫代码
from bs4 import BeautifulSoup
import urllib.request
import codecs
import requests
import time
import resource
import random
import urllib.parse
from mylog import MyLog as mylog
class Item(object):
    """Plain record for one chart entry; every field defaults to None."""

    # top_num    -- rank on the chart
    # score      -- score
    # mvName     -- name of the MV
    # singer     -- singer
    # releasTime -- release time
    top_num = score = mvName = singer = releasTime = None
class getMvList(object):
    """Crawl the yinyuetai V-chart pages and append the results to a text file.

    Instantiating the class runs the whole crawl: for every area the first
    three chart pages are downloaded, parsed with BeautifulSoup and the
    collected entries are appended to ``mvTopList.txt``.
    """

    # Area code used in the chart URL -> name written to the output file.
    # 'KR' used to be mapped to '日本' (Japan); it is the code for Korea.
    AREA_NAMES = {'ML': '内地', 'HT': '港台', 'US': '美国', 'KR': '韩国', 'JP': '日本'}

    def __init__(self):
        self.urlBase = 'http://vchart.yinyuetai.com/vchart/trends?'
        self.areasDic = dict(self.AREA_NAMES)
        self.log = mylog()
        self.geturls()

    def geturls(self):
        """Build the page URLs of every area and crawl them area by area."""
        areas = ['ML', 'HT', 'US', 'KR', 'JP']
        pages = [str(i) for i in range(1, 4)]
        for area in areas:
            urls = []
            for page in pages:
                url = self.urlBase + 'area=' + area + '&page=' + page
                urls.append(url)
                self.log.info(u'添加URL:%s 到URLS' % url)
            self.spider(area, urls)

    def getRseponseContent(self, url):
        """Download *url* and return the decoded page text, or None on failure.

        The request is first sent through a randomly chosen proxy with a
        randomly chosen User-Agent. If the proxied attempt fails, it is
        retried once without the proxy.
        """
        headers = {'User-Agent': self.getRandomHeaders()}
        proxy = self.getRandomProxy()
        # requests does not use openers installed through
        # urllib.request.install_opener, so the proxy must be passed to
        # requests itself; otherwise it is silently never used.
        res = self._fetch(url, headers, {'http': 'http://' + proxy})
        if res is None:
            self.log.warn(u'通过代理 %s 请求URL:%s 失败,改为直连重试' % (proxy, url))
            res = self._fetch(url, headers, None)
        if res is None:
            self.log.error(u'Python 返回URL:%s 数据失败' % url)
            return None
        res.encoding = res.apparent_encoding
        # Pause between successful requests to avoid hammering the site.
        time.sleep(1)
        self.log.info(u'Python 返回URL:%s 数据成功' % url)
        return res.text

    def _fetch(self, url, headers, proxies):
        """Return the successful response for *url*, or None if the request fails."""
        try:
            res = requests.get(url, timeout=30, headers=headers, proxies=proxies)
            res.raise_for_status()
        except (requests.RequestException, ValueError):
            # ValueError covers a malformed proxy address rejected while the
            # connection is being set up.
            return None
        return res

    def spider(self, area, urls):
        """Parse every page in *urls* and pass the collected items to piplines."""
        items = []
        for url in urls:
            responseContent = self.getRseponseContent(url)
            if not responseContent:
                continue
            soup = BeautifulSoup(responseContent, 'lxml')
            for tag in soup.find_all('li', attrs={'name': 'dmvLi'}):
                item = self._parseTag(tag)
                if item is None:
                    # One malformed entry must not abort the whole crawl.
                    self.log.error(u'URL:%s 中有条目缺少字段,已跳过' % url)
                    continue
                items.append(item)
                self.log.info(u'添加mvName为<<%s>>的数据成功' % (item.mvName))
        self.piplines(items, area)

    def _parseTag(self, tag):
        """Build an Item from one chart ``<li>`` tag; None if a field is missing."""
        def text(name, cssClass):
            node = tag.find(name, attrs={'class': cssClass})
            return None if node is None else node.get_text()

        item = Item()
        item.top_num = text('div', 'top_num')
        # The score is rendered with either the desc_score or the asc_score class.
        score = text('h3', 'desc_score')
        item.score = score if score is not None else text('h3', 'asc_score')
        item.mvName = text('a', 'mvname')
        item.singer = text('a', 'special')
        item.releasTime = text('p', 'c9')
        fields = (item.top_num, item.score, item.mvName, item.singer, item.releasTime)
        if any(value is None for value in fields):
            return None
        return item

    def getRandomProxy(self):
        """Return a random "host:port" entry from resource.PROXIES."""
        return random.choice(resource.PROXIES)

    def getRandomHeaders(self):
        """Return a random User-Agent string from resource.UserAgents."""
        return random.choice(resource.UserAgents)

    def piplines(self, items, area):
        """Append *items* of one *area* to ``mvTopList.txt`` under a dated header."""
        fileName = 'mvTopList.txt'
        nowTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        # codecs.open works on the raw byte stream, so the explicit '\r\n'
        # line endings are written exactly as given.
        with codecs.open(fileName, 'a', 'utf8') as fp:
            fp.write('%s ------------------------------------------------------- %s\r\n' % (self.areasDic.get(area), nowTime))
            for item in items:
                fp.write('%s %s \t %s \t %s \t %s \r\n' % (item.top_num, item.score, item.releasTime, item.mvName, item.singer))
                self.log.info(u'添加mvName为<<%s>>的数据成功' % (item.mvName))
            fp.write('\r\n' * 4)
# Script entry point: constructing getMvList starts the crawl, because its
# __init__ calls geturls().
if "__main__" == __name__:
    GML = getMvList()
(以下行号均指爬虫代码文件自身的行号。)11-16行参考了scrapy中的item;24-34行将不同榜单前50名所在的网页URL存到列表中;35-49行为请求添加请求头和代理ip,二者都通过一个随机函数从resource中随机挑选,其中36-38行设置代理ip,40-42行添加请求头;51-69行模仿scrapy中的spider模块,用BeautifulSoup解析返回的网页,提取所需要的内容;76-83行将抓取的内容以txt格式保存。