Python爬虫系列之某个人站点信息爬取

Python爬虫系列之某个人站点信息爬取

代码仅供学习交流,请勿用于非法用途

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

微信请扫描下方二维码

在这里插入图片描述

  • 以下为代码部分
import requests
import re

'''
    @Author:王磊
    @Time  :2018/11/15 15:29:06
'''


def getHTML(url):
    '''
    Fetch a page and return its body decoded to text.

    The body is decoded with the encoding detected from the content
    itself; undecodable bytes are dropped rather than raising.

    :param url: address of the page to download
    :return: the response body as a str
    '''
    res = requests.get(url)
    # apparent_encoding is None when the charset detector cannot decide
    # (for example on an empty body); passing None to bytes.decode raises
    # TypeError, so fall back to the declared encoding and then to UTF-8.
    encoding = res.apparent_encoding or res.encoding or 'utf-8'
    return res.content.decode(encoding, 'ignore')


def getImages(html):
    '''
    Collect image paths from the page HTML and turn them into full URLs.

    Each regex match has its first two characters removed and is then
    appended to the site root.

    NOTE(review): the pattern is an empty string in this copy of the file,
    so findall yields one empty match per position in html and every
    returned entry is just the site root. The intended expression has
    presumably been lost -- restore it before use.

    :param html: page source as a string
    :return: list of absolute image URLs
    '''
    site_root = 'http://www.eastmountyxz.com/'
    matcher = re.compile(r'')
    return [site_root + relative[2:] for relative in matcher.findall(html)]


def getArticals(html):
    '''
    Extract article entries from the page HTML.

    NOTE(review): both patterns are reproduced as they appear in this copy
    of the file, where they contain only newlines around a single capture
    group. The indexing below expects two groups per title match (URL
    first, then title), so the markup inside the patterns has presumably
    been lost -- restore it before relying on the output.

    :param html: page source as a string
    :return: list of [url, title, content] lists, one per article
    '''
    pat_title_url = re.compile(r'\n\n(.*?)\n\n')
    res_tuple = re.findall(pat_title_url, html)
    # re.S lets the content group span several lines.
    pat_content = re.compile(r'\n\n(.*?)\n\n', re.S)
    res_contents = re.findall(pat_content, html)
    # zip stops at the shorter list, so a page with fewer content matches
    # than title matches yields fewer entries instead of an IndexError.
    return [
        [link_and_title[0], link_and_title[1], content]
        for link_and_title, content in zip(res_tuple, res_contents)
    ]


def getUserInfo(html):
    '''
    Extract the personal-information entries from the page HTML.

    NOTE(review): the pattern is reproduced as it appears in this copy of
    the file (newlines around one capture group); the original markup
    inside it has presumably been lost -- confirm before use.

    :param html: page source as a string
    :return: list of matched strings
    '''
    pat = re.compile(r'\n\n(.*?)\n\n')
    return re.findall(pat, html)


def run():
    '''
    Entry point: download the home page, print the extracted articles and
    personal information, then save every extracted image to disk.

    :return: None
    '''
    url = 'http://www.eastmountyxz.com'
    html = getHTML(url)

    # Data extraction
    imgURLs = getImages(html)
    articalContent = getArticals(html)
    info = getUserInfo(html)

    # Printing
    for number, article in enumerate(articalContent, start=1):
        print("第%d篇文章:" % number)
        print("文章标题:%s" % article[1])
        print("文章内容:%s" % article[2])
        print("Url地址:%s" % article[0])
        print("\r\n")
    print("用户个人信息:")
    for entry in info:
        print(entry)

    # Image download
    # NOTE(review): the target directory is hard-coded and written without
    # a separator after the drive letter -- confirm the intended location.
    for index, img_url in enumerate(imgURLs):
        resp = requests.get(img_url)
        # The with-block closes the file; no explicit close() is needed.
        with open("c:Users/asus/Desktop/pc/img/" + str(index) + ".gif", "wb") as f:
            f.write(resp.content)


if __name__ == '__main__':
    run()

你可能感兴趣的:(Python)