Python爬虫实例:爬取某个网页的子网页

笔者的目的是对已有的白名单进行细化处理。比如现在有常见域名名单(百度、腾讯、搜狐等等),笔者要做的是对每一个域名爬取其所有的子网站,比如腾讯对应的还有腾讯视频、微信、QQ、腾讯新闻等等。

笔者的输入是一个包含常见域名白名单的xls文件,输出是一个包含白名单细花后的所有网站的xls文件。代码如下:

import tldextract, requests, xlwt, time, random, sys
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from xlrd import open_workbook

#数据初始化
def init():
    global url, headers, workbook, table, row_now, get_url, todo_url, get_domin, count_layer
    get_url = []
    todo_url = []
    get_domin = []
    count_layer = 0
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    workbook = xlwt.Workbook(encoding='utf-8')
    table=workbook.add_sheet("name",cell_overwrite_ok=True)
    row_now = 0

#获取当前URL的页面 用BeautifulSoup解析后返回
def GetUrlCodeBS(now_url):
    req = requests.get(now_url, headers=headers)
    if req.encoding == 'ISO-8859-1':
        encodings = requests.utils.get_encodings_from_content(req.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = req.apparent_encoding
    else:
         encoding = req.encoding
    encode_content = req.content.decode(encoding, 'ignore').encode('utf-8', 'ignore')
    return BeautifulSoup(encode_content.decode('utf-8'))
    
#判断当前函数是否可以作为待选项
def TodoUrl(now_url,domain_url):
    global todo_url
    if 'javascript' in now_url:
        return 0
    elif '/' == now_url:
        return 0
    elif '#' in now_url:
        return 0
    elif 'com' in now_url:
        return 0
    elif 'cn' in now_url:
        return 0
    elif '{' in now_url:
        return 0
    else:
        todo_url.append(domain_url+now_url)

#获取当前URL页面的title
def GetUrlTitle(now_url):
    try:
        soup = GetUrlCodeBS(now_url)
        return soup.head.title.text
    except:
        return False    

#获取当前域名下的所有子域名
def GetSubdomain(domain_url,domain_name):
    global get_url, todo_url, get_domin
    now_domain = tldextract.extract(domain_url).domain
    soup = GetUrlCodeBS(domain_url)
    message = soup.find_all('a')
    test_1 = []
    test_1.append(domain_url)
    test_1.append(domain_name)
    get_url.append(test_1)
    for data in message:
        try:
            #判断当前的href属性是否存在
            if hasattr(data,'href')== True:
                #根据不同的href内容执行不同操作
                if 'http' in data['href'] :
                    data_url = urlparse(data['href']).scheme + '://' + urlparse(data['href']).netloc
                    #若当前URL的domain和当前页面的domain相同 认为此URL是当前页面的子网页或者同级网页
                    if tldextract.extract(data_url).domain == now_domain:
                        #若当前URL的地址以及被获取过 则抛弃该URL
                        if urlparse(data_url).netloc in get_domin:
                            continue
                        test_1 = []
                        test_1.append(data_url)
    #                     if '/' in data['href'].replace('//',''):
    #                         url_title = GetUrlTitle(data_url)
    #                         if url_title == False:
    #                             continue
    #                     else:
                        if len(data.text)>10 or len(data.text)==0:
                            continue
                        url_title = data.text
                        test_1.append(url_title)
                        get_domin.append(urlparse(data_url).netloc)
                        get_url.append(test_1)
                        table.write(row_now, 0, url_title)
                        table.write(row_now, 1, data_url)
                        row_now = row_now + 1
                elif 'www' in data['href']:
                    data_url = 'https://' + urlparse(data['href']).netloc
                    if tldextract.extract(data_url).domain == now_domain:
                        if urlparse(data_url).netloc in get_domin:
                            continue
                        test_1 = []
                        test_1.append(data_url)
    #                     if '/' in data['href'].replace('//',''):
    #                         url_title = GetUrlTitle(data_url)
    #                         if url_title == False:
    #                             continue
    #                     else:
                        if len(data.text)>10 or len(data.text)==0:
                            continue
                        url_title = data.text
                        test_1.append(url_title)
                        get_domin.append(urlparse(data_url).netloc)
                        get_url.append(test_1)
                        table.write(row_now, 0, url_title)
                        table.write(row_now, 1, data_url)
                        row_now = row_now + 1
#                     else:
#                         TodoUrl(data['href'],domain_url)
#                 else:
#                     TodoUrl(data['href'],domain_url)
        except:
            continue
# #     for url_new in todo_url:
# #         print(url_new)
# #         time.sleep(random.random()*10)
# #         GetSubdomain_Sub(url_new)

#获取xls文件中的列表,逐个细化
def GetXlsToDetail():
    global get_url, get_domin
    workbook = open_workbook(r'../changjianyuming/res_data/常见域名列表_综合其他.xls')
    sheet = workbook.sheet_by_index(0)
    for i in range(sheet.nrows): 
        try:
            GetSubdomain('https://'+'www.'+sheet.row_values(i)[1],sheet.row_values(i)[0])
        except:
            try:
                GetSubdomain('http://'+'www.'+sheet.row_values(i)[1],sheet.row_values(i)[0])
            except:
                try:
                    GetSubdomain('https://'+sheet.row_values(i)[1],sheet.row_values(i)[0])
                except:
                    try:
                        GetSubdomain('http://'+sheet.row_values(i)[1],sheet.row_values(i)[0]) 
                    except:
                        print('获取' + sheet.row_values(i)[1] + '网页代码失败!')
                        continue
        get_url = []
        get_domin = []
        

#主函数
if __name__ == '__main__':
    init()
    GetXlsToDetail()
    workbook.save('../changjianyuming/res_data/常见域名列表_综合其他_细化.xls')

 

你可能感兴趣的:(Python爬虫)