# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import urllib
import urllib2
import shutil
from pyExcelerator import *
# Disable TLS certificate verification (the site is https; Python 2 hack)
context = ssl._create_unverified_context()
# Base URL template for the pages to scrape ({} is the page number)
baseUrl = 'https://www.qiushibaike.com/text/page/{}/'
# Regex matching the nickname and avatar on a listing page.
# NOTE(review): left empty -- re.compile('').findall() yields empty strings,
# so getUserInfo's user[0]/user[3] indexing will raise IndexError. Fill in
# the real pattern (at least 4 groups: presumably group 1 = avatar URL,
# group 4 = nickname -- confirm against the page markup).
pattern = ''
# Root directory for scraped content
resourcePath = 'd:/Reptilian/health/content/'
# Directory for downloaded avatar images
imgPath = resourcePath + 'img/'
# Workbook collecting one (nickname, avatar URL) row per user
w = Workbook()
ws = w.add_sheet('1')
# Next worksheet row to write (row 0 is reserved for the header)
count = 1
# HTTP request headers (mimic a desktop Chrome browser)
headers = {
'Connection': 'keep-alive',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Accept':'text/html,application/xhtml+xml,application/xml;\
q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Build the address of one listing page to scrape.
def buildUrl(page):
    """Return the listing URL for the given page number."""
    page_str = str(page)
    return baseUrl.format(page_str)
# Scrape one listing page and record every user found on it.
def getUserInfo(page):
    """Fetch page *page*, extract user entries with `pattern`, write each
    user's nickname and avatar URL to the worksheet, and download the
    avatar image.

    Relies on module globals: pattern, headers, context, ws, count.
    NOTE(review): `pattern` is empty at module level; it must be filled in
    (group 1 = avatar URL, group 4 = nickname, presumably) before this
    function can work -- confirm against the page markup.
    """
    url = buildUrl(page)
    print(url)
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    rStr = str(resp.read())
    userList = re.compile(pattern).findall(rStr)
    # utf-8 -> gbk re-encode: hack to print Chinese on a Windows (cp936)
    # console under Python 2.
    print(('第' + str(page) + '页动态条数:' + str(len(userList))).decode('utf-8').encode('gbk'))
    global count
    for user in userList:
        # The regex captures a protocol-less host/path; prepend the scheme.
        imgUrl = 'http://' + user[0]
        ws.write(count, 1, imgUrl)
        # Round-trip through GBK drops characters GBK cannot represent
        # before handing a unicode nickname to pyExcelerator.
        ws.write(count, 0, user[3].decode('utf-8').encode('gbk').decode('gbk'))
        count += 1
        downloadImg(imgUrl)
# Create a storage directory if it does not already exist.
def mkDir(path):
    """Create directory *path* (including parents) when missing.

    Leading/trailing whitespace and trailing backslashes are stripped from
    *path* first. Prints a notice (GBK-encoded for the Windows console,
    Python 2 only) when the directory already exists.
    """
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        # utf-8 -> gbk re-encode: Windows-console printing hack (Python 2).
        print('目录已存在!'.decode('utf-8').encode('gbk'))
# Download one avatar image into the image directory.
def downloadImg(url):
    """Fetch *url* and save it as '<count>.jpg' under imgPath.

    The global `count` (the current worksheet row) is used as the file
    name so each image lines up with its spreadsheet row. `count` is only
    read here, so no `global` declaration is needed.
    """
    urllib.urlretrieve(url, imgPath + str(count) + '.jpg')
# --- script entry: scrape pages 1-5 and save the spreadsheet -------------
# Directory creation and the header row are loop-invariant, so they are
# done once up front (the original re-ran them on every iteration).
mkDir(resourcePath)
mkDir(imgPath)
# utf-8 -> gbk round-trip drops characters GBK cannot represent (Python 2).
ws.write(0, 0, '用户昵称'.decode('utf-8').encode('gbk').decode('gbk'))
ws.write(0, 1, '用户头像地址'.decode('utf-8').encode('gbk').decode('gbk'))
for i in range(1, 6):
    getUserInfo(i)
# Save once after all pages are scraped instead of after every page.
w.save(resourcePath + 'content.xls')