# HomeWorkDay06 --- 读取5000邮箱 (collect 5000 e-mail addresses)

import requests
import re
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
# Baidu search URL; the `wd` query parameter is the URL-encoded phrase
# "留下邮箱" ("leave your e-mail").
url = 'https://www.baidu.com/s?wd=%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1'

# Seconds to wait on a single HTTP request before giving up, so that one
# unresponsive host cannot hang the whole run.
REQUEST_TIMEOUT = 10

# A double-quoted string that starts with "http" and contains "//".
# Raw string: the escapes are meant for the regex engine, not for Python.
LINK_PATTERN = re.compile(r'"(http.*?//.*?)"')

# E-mail address matcher, compiled once instead of once per fetched page.
EMAIL_PATTERN = re.compile(
    r"[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*"
    r"@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?"
)


def find_links(html):
    """Return every double-quoted http(s) URL found in *html*, in order."""
    return LINK_PATTERN.findall(html)


def pick_pages(links):
    """Return the links to crawl: non-cache links, minus the first one.

    A new list is built instead of removing items from *links* while
    iterating over it, which used to skip the element following every
    removal. Slicing also keeps an empty result from raising IndexError.
    """
    # NOTE(review): the filter keeps links that do NOT point at
    # cache.baiducontent, as the original condition did -- confirm that
    # this, and not the cached snapshots, is what should be crawled.
    pages = [link for link in links if 'cache.baiducontent' not in link]
    return pages[1:]


def find_emails(html):
    """Return every e-mail address found in *html*, in order."""
    return EMAIL_PATTERN.findall(html)


def main():
    """Search Baidu, visit the selected result links and collect e-mails.

    Prints the accumulated address list after each successfully fetched
    page and returns it. A failure of the initial search request
    propagates; a failure on an individual page is reported on stderr and
    that page is skipped.
    """
    import sys

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    pages = pick_pages(find_links(response.text))

    emails = []
    for page in pages:
        try:
            page_html = requests.get(
                page, headers=headers, timeout=REQUEST_TIMEOUT
            ).text
        except requests.RequestException as exc:
            # Scraped links are often malformed or dead; one bad link
            # should not abort the whole crawl.
            print('skipping', page, '-', exc, file=sys.stderr)
            continue
        # Store the matches: previously they were computed and discarded,
        # so the printed list was always empty.
        emails.extend(find_emails(page_html))
        print(emails)
    return emails


if __name__ == "__main__":
    main()

# 你可能感兴趣的:(DFZY)