# 代码仅供学习交流,请勿用于非法用途
import requests
import re
'''
@Author:王磊
@Time :2018/11/15 15:29:06
'''
def getHTML(url, timeout=10):
    '''
    Download a page and return its body as decoded text.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server; without a timeout
        ``requests.get`` can block forever on a stalled connection
    :return: the response body decoded with the encoding detected from its
        content (UTF-8 when detection yields nothing); bytes that cannot be
        decoded are dropped
    '''
    res = requests.get(url, timeout=timeout)
    # apparent_encoding can be None when detection fails, and
    # bytes.decode(None, ...) raises TypeError, so fall back to UTF-8.
    encoding = res.apparent_encoding or 'utf-8'
    return res.content.decode(encoding, 'ignore')
def getImages(html):
    '''
    Extract the image addresses from a page and make them absolute.

    :param html: page source as text
    :return: list of absolute image URLs, in the order they appear in html
    '''
    from urllib.parse import urljoin

    # NOTE(review): the pattern in the original was empty (its markup seems
    # to have been stripped), which made findall() return one empty string per
    # character position. This pattern captures the src attribute of <img>
    # tags, matching the documented intent -- confirm against the site markup.
    pat = re.compile(r'''<img\b[^>]*?\bsrc\s*=\s*["'](.*?)["']''', re.I)
    hostUrl = 'http://www.eastmountyxz.com/'
    # urljoin resolves "./x", "x" and already-absolute addresses correctly;
    # blindly dropping the first two characters only worked for "./x".
    return [urljoin(hostUrl, src) for src in pat.findall(html)]
def getArticals(html):
    '''
    Extract the articles listed on a page.

    :param html: page source as text
    :return: list of ``[url, title, content]`` lists, one per article, in
        page order
    '''
    # NOTE(review): both pattern literals in the original were split across
    # lines (a syntax error) and had lost their markup. The code indexes two
    # groups (URL, then title) from the first pattern and one group from the
    # second, so they are reconstructed as a link inside a heading and a
    # paragraph body -- confirm the exact tags against the site markup.
    pat_title_url = re.compile(
        r'<h\d[^>]*>\s*<a\b[^>]*?\bhref="(.*?)"[^>]*>(.*?)</a>\s*</h\d>')
    pat_content = re.compile(r'<p\b[^>]*>(.*?)</p>', re.S)
    title_urls = pat_title_url.findall(html)
    contents = pat_content.findall(html)
    # zip() stops at the shorter list, so a page with fewer content matches
    # than titles no longer raises IndexError.
    return [[url, title, content]
            for (url, title), content in zip(title_urls, contents)]
def getUserInfo(html):
    '''
    Collect the entries meant to hold the site owner's personal information.

    :param html: page source as text
    :return: list with the captured group of every pattern match
    '''
    # NOTE(review): the pattern literal in the original was split across two
    # lines (a syntax error) and appears to have lost the HTML tags around its
    # group. It is kept as written, in a valid literal: as it stands it
    # captures the text of every newline-terminated line. Restore the real
    # tags from the site markup before relying on the result.
    pat = re.compile(r'(.*?)\n')
    return pat.findall(html)
def run(save_dir='c:/Users/asus/Desktop/pc/img/'):
    '''
    Scrape the home page, print what was found and download its images.

    Prints every article (title, content, URL) and the personal-information
    entries, then saves each image as ``<index>.gif`` inside save_dir.

    :param save_dir: directory the images are written to; it is created when
        missing
    :return: None
    '''
    import os

    url = 'http://www.eastmountyxz.com'
    html = getHTML(url)

    # Collect the data.
    imgURLs = getImages(html)
    articalContent = getArticals(html)
    info = getUserInfo(html)

    # Print the data.
    for number, (article_url, title, content) in enumerate(articalContent, 1):
        print("第%d篇文章:" % number)
        print("文章标题:%s" % title)
        print("文章内容:%s" % content)
        print("Url地址:%s" % article_url)
        print("\r\n")
    print("用户个人信息:")
    for item in info:
        print(item)

    # Download the images. The original path "c:Users/..." lacked the slash
    # after the drive letter, which Windows resolves relative to the current
    # directory of drive C: instead of the drive root.
    os.makedirs(save_dir, exist_ok=True)
    for index, img_url in enumerate(imgURLs):
        resp = requests.get(img_url, timeout=10)
        # The with-block closes the file; no explicit close() is needed.
        with open(os.path.join(save_dir, str(index) + ".gif"), "wb") as f:
            f.write(resp.content)
# Script entry point: scrape only when executed directly, not when imported.
if __name__ == "__main__":
    run()