Python3 Crawler: Extracting All Real URLs from a Requested Page with BeautifulSoup

In HTML, a hyperlink is written as <a href="xx">, so extracting a page's URLs means extracting the 'xx' value of each href attribute.
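As a minimal sketch of that idea (the HTML fragment below is made up for illustration), BeautifulSoup exposes the attribute through the tag's get('href') method:

from bs4 import BeautifulSoup

html = '<a href="http://example.com/a">A</a><a name="anchor">no href</a>'
soup = BeautifulSoup(html, 'lxml')
for tag in soup.find_all('a'):
    print(tag.get('href'))  # prints the href value, or None if the tag has no href attribute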

Method 1: find_all

import urllib
import requests
from urllib.parse import urlparse
from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰伦'
# word is the search keyword; pn is the parameter Baidu uses for pagination
url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(word) + '&pn=0'
# request headers (the User-Agent value here is just an example); Baidu may block requests without one
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
print(url)
# get the domain from the URL
res = urlparse(url)
domain = res.netloc
print(domain)
print('- - '*30)

response = request.urlopen(url)
page = response.read()
soup = BeautifulSoup(page, 'lxml')
# tags = soup.find_all('h3')  # returns a list
tags = soup.find_all('a')  # get all <a> tags; returns a list
fout = open(r'F:\security\web\output\report\test.txt', 'w+')
hrefs = []
for tag in tags:
    href = tag.get('href')  # get the href attribute of the <a> tag (i.e. the URL); returns None if absent
    if href is not None:
        hrefs.append(href)
hrefs = list({}.fromkeys(hrefs).keys())  # deduplicate while preserving order
for href in hrefs:
    if href.strip().startswith('http://') or href.strip().startswith('https://'):
        response_url = requests.get(url=href.strip(), headers=headers, allow_redirects=False)  # allow_redirects=False disables automatic redirects
        try:
            real_url = response_url.headers['Location']  # the Location header holds the real destination URL
        except Exception:
            fout.write(href.strip() + '\n')  # no redirect: the link itself is the real URL
        else:
            if real_url.startswith('http'):
                fout.write(real_url + '\n')
    else:
        # relative link: rebuild an absolute URL from the domain
        l = href.split(domain)
        if len(l) == 1:
            if href.startswith('/'):
                href = 'http://' + domain + href
            else:
                href = 'http://' + domain + '/' + href
        else:
            href = 'http://' + domain + l[1]
        response_url = requests.get(url=href.strip(), headers=headers, allow_redirects=False)  # allow_redirects=False disables automatic redirects
        try:
            real_url = response_url.headers['Location']  # the Location header holds the real destination URL
        except Exception:
            fout.write(href.strip() + '\n')
        else:
            if real_url.startswith('http'):
                fout.write(real_url + '\n')
fout.close()
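Both branches above resolve redirects the same way: request the link with allow_redirects=False and, if the server answers with a redirect, read the real destination from the Location header (a Baidu result link of the form http://www.baidu.com/link?url=... typically answers with a 302). As a sketch, that step can be factored into a helper; resolve_real_url is a name of my own, not from the original code:

def resolve_real_url(href, headers):
    # Request without following redirects; if the response carries a
    # Location header, that header is the real destination URL.
    resp = requests.get(url=href.strip(), headers=headers, allow_redirects=False)
    location = resp.headers.get('Location')
    if location and location.startswith('http'):
        return location
    return href.strip()  # not a redirect: the link itself is the real URL

With such a helper, each write in the loop shrinks to fout.write(resolve_real_url(href, headers) + '\n').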

 

Method 2: select

import urllib
import requests
from urllib.parse import urlparse
from urllib import request
from bs4 import BeautifulSoup

word = '周杰伦'
# word is the search keyword; pn is the parameter Baidu uses for pagination
url = 'http://www.baidu.com/s?wd=' + urllib.parse.quote(word) + '&pn=0'
# request headers (the User-Agent value here is just an example); Baidu may block requests without one
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
print(url)
res = urlparse(url)
domain = res.netloc
print(domain)
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
# tagh3 = soup.select('h3 > a[href]')  # in a CSS selector, tags at different levels are separated by a space (or '>' for a direct child); tags at the same level are not
tags = soup.select('a[href]')  # returns a list of <a> tags that have an href attribute
hrefs = []
for tag in tags:
    hrefs.append(tag.get('href'))  # extract the href value (i.e. the URL)
hrefs = list({}.fromkeys(hrefs).keys())  # deduplicate while preserving order
fw2 = open(r'F:\..\..\demo2.txt', 'w+')
for href in hrefs:
    if href.strip().startswith('http://') or href.strip().startswith('https://'):
        response_url = requests.get(url=href.strip(), headers=headers, allow_redirects=False)  # allow_redirects=False disables automatic redirects
        try:
            real_url = response_url.headers['Location']  # the Location header holds the real destination URL
        except Exception:
            fw2.write(href.strip() + '\n')
        else:
            if real_url.startswith('http'):
                fw2.write(real_url + '\n')
    else:
        l = href.strip().split(domain)
        if len(l) == 1:
            if href.strip().startswith('/'):
                href = 'http://' + domain + href.strip()
            else:
                href = 'http://' + domain + '/' + href.strip()
        else:
            href = "http://" + domain + l[1]
        # print(href.strip())
        response_url = requests.get(url=href.strip(), headers=headers, allow_redirects=False)  # allow_redirects=False disables automatic redirects
        try:
            real_url = response_url.headers['Location']  # the Location header holds the real destination URL
        except Exception:
            fw2.write(href.strip() + '\n')
        else:
            if real_url.startswith('http'):
                fw2.write(real_url + '\n')
fw2.close()
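The commented-out selector h3 > a[href] is worth noting: the code in Method 1 also looked at h3 tags because Baidu result titles sit inside them, so the child combinator filters out navigation and sidebar links. A minimal sketch of the difference, using a made-up fragment:

from bs4 import BeautifulSoup

html = '<h3><a href="/result">result title</a></h3><div><a href="/sidebar">sidebar</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(len(soup.select('a[href]')))       # 2 -- every <a> with an href
print(len(soup.select('h3 > a[href]')))  # 1 -- only the <a> that is a direct child of an <h3>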

 
